From 1884a7d5eb875a2fcdce4f225953f7ae9e3e2ead Mon Sep 17 00:00:00 2001 From: Jing chen He Date: Tue, 9 Jun 2026 22:01:44 -0700 Subject: [PATCH 1/3] fix(java): expose updatedFragmentOffsets on Update operation for RewriteColumns commits --- java/lance-jni/src/error.rs | 6 ++ java/lance-jni/src/transaction.rs | 77 ++++++++++++++++- .../main/java/org/lance/operation/Update.java | 64 +++++++++++++- .../java/org/lance/operation/UpdateTest.java | 84 +++++++++++++++++++ 4 files changed, 224 insertions(+), 7 deletions(-) diff --git a/java/lance-jni/src/error.rs b/java/lance-jni/src/error.rs index cdb922a3cef..e27abd4fda6 100644 --- a/java/lance-jni/src/error.rs +++ b/java/lance-jni/src/error.rs @@ -236,6 +236,12 @@ impl From for Error { } } +impl From for Error { + fn from(err: std::io::Error) -> Self { + Self::io_error(err.to_string()) + } +} + impl From for Error { fn from(err: Utf8Error) -> Self { Self::input_error(err.to_string()) diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 6bc1948ae6a..ce0f3f2416b 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -19,7 +19,7 @@ use jni::sys::{jboolean, jint, jlong}; use lance::dataset::CommitBuilder; use lance::dataset::transaction::{ DataReplacementGroup, Operation, RewriteGroup, RewrittenIndex, Transaction, TransactionBuilder, - UpdateMap, UpdateMapEntry, UpdateMode, + UpdateMap, UpdateMapEntry, UpdateMode, UpdatedFragmentOffsets, }; use lance::io::ObjectStoreParams; use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; @@ -433,7 +433,7 @@ fn convert_to_java_operation_inner<'local>( fields_for_preserving_frag_bitmap, update_mode, inserted_rows_filter: _, - updated_fragment_offsets: _, + updated_fragment_offsets, } => { let removed_ids: Vec> = removed_fragment_ids .iter() @@ -457,9 +457,44 @@ fn convert_to_java_operation_inner<'local>( &[JValue::Object(&update_mode)], )? .l()?; + // Serialize updated_fragment_offsets to Java Map. + // Values are portable RoaringBitmap bytes so the JNI boundary stays O(bitmap size) + // rather than O(n rows). Empty HashMap when None so the Java constructor always + // receives a non-null map. A per-iteration local frame (capacity 4: Long + byte[] + + // put return + slack) bounds local-ref growth for large offset maps. + let java_offsets_map = { + let java_map = env.new_object("java/util/HashMap", "()V", &[])?; + if let Some(UpdatedFragmentOffsets(ref map)) = updated_fragment_offsets { + for (frag_id, bitmap) in map { + let mut buf: Vec = Vec::new(); + bitmap.serialize_into(&mut buf)?; + // JNI byte arrays are signed i8; reinterpret without copying. + let buf_i8: &[i8] = unsafe { + std::slice::from_raw_parts(buf.as_ptr() as *const i8, buf.len()) + }; + env.with_local_frame(4, |env| { + let java_key = env.new_object( + "java/lang/Long", + "(J)V", + &[JValue::Long(*frag_id as i64)], + )?; + let java_arr = env.new_byte_array(buf_i8.len() as i32)?; + env.set_byte_array_region(&java_arr, 0, buf_i8)?; + env.call_method( + &java_map, + "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;", + &[JValue::Object(&java_key), JValue::Object(&*java_arr)], + )?; + Ok::(JObject::null()) + })?; + } + } + java_map + }; Ok(env.new_object( "org/lance/operation/Update", - "(Ljava/util/List;Ljava/util/List;Ljava/util/List;[J[JLjava/util/Optional;)V", + "(Ljava/util/List;Ljava/util/List;Ljava/util/List;[J[JLjava/util/Optional;Ljava/util/Map;)V", &[ JValue::Object(&removed_fragment_ids_obj), JValue::Object(&updated_fragments_obj), @@ -467,6 +502,7 @@ fn convert_to_java_operation_inner<'local>( JValueGen::Object(&fields_modified), JValueGen::Object(&fields_for_preserving_frag_bitmap), JValue::Object(&update_mode_optional), + JValue::Object(&java_offsets_map), ], )?) } @@ -1232,6 +1268,39 @@ fn convert_to_rust_operation( update_mode.extract_object(env) })?; + let updated_fragment_offsets = { + let offsets_obj = env + .call_method( + java_operation, + "updatedFragmentOffsets", + "()Ljava/util/Map;", + &[], + )? + .l()?; + if offsets_obj.is_null() { + None + } else { + let jmap = JMap::from_env(env, &offsets_obj)?; + let mut iter = jmap.iter(env)?; + let mut offsets: HashMap = HashMap::new(); + env.with_local_frame(32, |env| { + while let Some((key, value)) = iter.next(env)? { + let frag_id = + env.call_method(&key, "longValue", "()J", &[])?.j()? as u64; + let buf: Vec = env.convert_byte_array(JByteArray::from(value))?; + let bitmap = RoaringBitmap::deserialize_from(buf.as_slice())?; + offsets.insert(frag_id, bitmap); + } + Ok::<(), Error>(()) + })?; + if offsets.is_empty() { + None + } else { + Some(UpdatedFragmentOffsets(offsets)) + } + } + }; + Operation::Update { removed_fragment_ids, updated_fragments, @@ -1241,7 +1310,7 @@ fn convert_to_rust_operation( fields_for_preserving_frag_bitmap, update_mode, inserted_rows_filter: None, - updated_fragment_offsets: None, + updated_fragment_offsets, } } "DataReplacement" => { diff --git a/java/src/main/java/org/lance/operation/Update.java b/java/src/main/java/org/lance/operation/Update.java index f886942b4b9..721bbd84b47 100644 --- a/java/src/main/java/org/lance/operation/Update.java +++ b/java/src/main/java/org/lance/operation/Update.java @@ -20,6 +20,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Optional; @@ -31,19 +32,30 @@ public class Update implements Operation { private final long[] fieldsForPreservingFragBitmap; private final Optional updateMode; + /** + * Per-fragment matched row offsets serialized as portable RoaringBitmap bytes (little-endian, + * spec-compliant). Keys are fragment ids; values are the serialized bitmap for the local physical + * row offsets (0-based) within the fragment whose columns were rewritten. Empty map means the + * caller did not supply offsets and the partial last_updated refresh in build_manifest will not + * activate. + */ + private final Map updatedFragmentOffsets; + private Update( List removedFragmentIds, List updatedFragments, List newFragments, long[] fieldsModified, long[] fieldsForPreservingFragBitmap, - Optional updateMode) { + Optional updateMode, + Map updatedFragmentOffsets) { this.removedFragmentIds = removedFragmentIds; this.updatedFragments = updatedFragments; this.newFragments = newFragments; this.fieldsModified = fieldsModified; this.fieldsForPreservingFragBitmap = fieldsForPreservingFragBitmap; this.updateMode = updateMode; + this.updatedFragmentOffsets = updatedFragmentOffsets; } public static Builder builder() { @@ -74,6 +86,10 @@ public Optional updateMode() { return updateMode; } + public Map updatedFragmentOffsets() { + return updatedFragmentOffsets; + } + @Override public String name() { return "Update"; @@ -87,6 +103,7 @@ public String toString() { .add("fieldsModified", fieldsModified) .add("fieldsForPreservingFragBitmap", fieldsForPreservingFragBitmap) .add("updateMode", updateMode) + .add("updatedFragmentOffsets", updatedFragmentOffsets) .toString(); } @@ -100,7 +117,32 @@ public boolean equals(Object o) { && Objects.equals(newFragments, that.newFragments) && Arrays.equals(fieldsModified, that.fieldsModified) && Arrays.equals(fieldsForPreservingFragBitmap, that.fieldsForPreservingFragBitmap) - && Objects.equals(updateMode, that.updateMode); + && Objects.equals(updateMode, that.updateMode) + && offsetMapsEqual(updatedFragmentOffsets, that.updatedFragmentOffsets); + } + + /** Deep-equality for {@code Map}: keys by value, arrays by content. */ + private static boolean offsetMapsEqual(Map a, Map b) { + if (a == b) return true; + if (a.size() != b.size()) return false; + for (Map.Entry entry : a.entrySet()) { + if (!Arrays.equals(entry.getValue(), b.get(entry.getKey()))) return false; + } + return true; + } + + @Override + public int hashCode() { + int h = Objects.hash(removedFragmentIds, updatedFragments, newFragments, updateMode); + h = 31 * h + Arrays.hashCode(fieldsModified); + h = 31 * h + Arrays.hashCode(fieldsForPreservingFragBitmap); + // Sum entry hashes (XOR key ^ array-content hash) so result is insertion-order-independent. + int mapHash = 0; + for (Map.Entry entry : updatedFragmentOffsets.entrySet()) { + mapHash += Long.hashCode(entry.getKey()) ^ Arrays.hashCode(entry.getValue()); + } + h = 31 * h + mapHash; + return h; } public enum UpdateMode { @@ -115,6 +157,7 @@ public static class Builder { private long[] fieldsModified = new long[0]; private long[] fieldsForPreservingFragBitmap = new long[0]; private Optional updateMode = Optional.empty(); + private Map updatedFragmentOffsets = Collections.emptyMap(); private Builder() {} @@ -148,6 +191,20 @@ public Builder updateMode(Optional updateMode) { return this; } + /** + * Set the per-fragment matched row offsets for a RewriteColumns commit. + * + *

Keys are fragment ids; values are portable RoaringBitmap bytes (little-endian, + * spec-compliant serialization) encoding the local physical row offsets (0-based) within the + * fragment that matched the update_columns hash join. When non-empty and update mode is + * RewriteColumns with stable row IDs enabled, build_manifest will call the partial last_updated + * refresh for those offsets only. + */ + public Builder updatedFragmentOffsets(Map updatedFragmentOffsets) { + this.updatedFragmentOffsets = updatedFragmentOffsets; + return this; + } + public Update build() { return new Update( removedFragmentIds, @@ -155,7 +212,8 @@ public Update build() { newFragments, fieldsModified, fieldsForPreservingFragBitmap, - updateMode); + updateMode, + updatedFragmentOffsets); } } } diff --git a/java/src/test/java/org/lance/operation/UpdateTest.java b/java/src/test/java/org/lance/operation/UpdateTest.java index bb39a5f4d12..120689e8798 100644 --- a/java/src/test/java/org/lance/operation/UpdateTest.java +++ b/java/src/test/java/org/lance/operation/UpdateTest.java @@ -36,10 +36,14 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertThrows; public class UpdateTest extends OperationTestBase { @@ -104,6 +108,86 @@ void testUpdate(@TempDir Path tempDir) throws Exception { } } + @Test + void testUpdatedFragmentOffsetsRoundTrip(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("testUpdatedFragmentOffsetsRoundTrip").toString(); + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + dataset = testDataset.createEmptyDataset(); + + // Append an initial fragment so we have a real fragment id. + FragmentMetadata fragmentMeta = testDataset.createNewFragment(10); + try (Transaction appendTxn = + new Transaction.Builder() + .readVersion(dataset.version()) + .operation( + Append.builder().fragments(Collections.singletonList(fragmentMeta)).build()) + .build()) { + new CommitBuilder(dataset).execute(appendTxn).close(); + } + + dataset = Dataset.open(datasetPath, allocator); + long fragmentId = dataset.getFragments().get(0).getId(); + FragmentMetadata newFragment = testDataset.createNewFragment(10); + + // Build Update with non-empty updatedFragmentOffsets. Values are portable RoaringBitmap + // bytes encoding {1, 3, 5}: cookie(4) + containerCount(4) + key(2) + card-1(2) + + // offset(4) + elems(6). Offset = 16 (start of container data from beginning of stream). + Map offsets = new HashMap<>(); + offsets.put( + fragmentId, + new byte[] { + (byte) 0x3A, + (byte) 0x30, + (byte) 0x00, + (byte) 0x00, // cookie = 12346 + (byte) 0x01, + (byte) 0x00, + (byte) 0x00, + (byte) 0x00, // 1 container + (byte) 0x00, + (byte) 0x00, // container key 0 + (byte) 0x02, + (byte) 0x00, // cardinality - 1 = 2 + (byte) 0x10, + (byte) 0x00, + (byte) 0x00, + (byte) 0x00, // offset = 16 + (byte) 0x01, + (byte) 0x00, // element 1 + (byte) 0x03, + (byte) 0x00, // element 3 + (byte) 0x05, + (byte) 0x00 // element 5 + }); + + try (Transaction updateTxn = + new Transaction.Builder() + .readVersion(dataset.version()) + .operation( + Update.builder() + .removedFragmentIds(Collections.singletonList(fragmentId)) + .newFragments(Collections.singletonList(newFragment)) + .updateMode(Optional.of(UpdateMode.RewriteRows)) + .updatedFragmentOffsets(offsets) + .build()) + .build()) { + try (Dataset committed = new CommitBuilder(dataset).execute(updateTxn)) { + // Read the committed transaction back (exercises the IntoJava JNI path). + try (Transaction readTx = committed.readTransaction().orElseThrow()) { + assertInstanceOf(Update.class, readTx.operation()); + Update readOp = (Update) readTx.operation(); + + Map readOffsets = readOp.updatedFragmentOffsets(); + assertEquals(1, readOffsets.size()); + assertArrayEquals(offsets.get(fragmentId), readOffsets.get(fragmentId)); + } + } + } + } + } + @Test void testUpdateColumns(@TempDir Path tempDir) throws Exception { String datasetPath = tempDir.resolve("testUpdateColumns").toString(); From 61dad6a451fdd918b98dfa627ceed9f562a57676 Mon Sep 17 00:00:00 2001 From: Jing chen He Date: Fri, 19 Jun 2026 22:45:05 -0700 Subject: [PATCH 2/3] fix(java): bound JNI locals and classify bad offset bitmap bytes as input errors --- java/lance-jni/src/error.rs | 6 ---- java/lance-jni/src/transaction.rs | 48 +++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/java/lance-jni/src/error.rs b/java/lance-jni/src/error.rs index e27abd4fda6..cdb922a3cef 100644 --- a/java/lance-jni/src/error.rs +++ b/java/lance-jni/src/error.rs @@ -236,12 +236,6 @@ impl From for Error { } } -impl From for Error { - fn from(err: std::io::Error) -> Self { - Self::io_error(err.to_string()) - } -} - impl From for Error { fn from(err: Utf8Error) -> Self { Self::input_error(err.to_string()) diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index ce0f3f2416b..f8b88f069a5 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -460,14 +460,18 @@ fn convert_to_java_operation_inner<'local>( // Serialize updated_fragment_offsets to Java Map. // Values are portable RoaringBitmap bytes so the JNI boundary stays O(bitmap size) // rather than O(n rows). Empty HashMap when None so the Java constructor always - // receives a non-null map. A per-iteration local frame (capacity 4: Long + byte[] + - // put return + slack) bounds local-ref growth for large offset maps. + // receives a non-null map. let java_offsets_map = { let java_map = env.new_object("java/util/HashMap", "()V", &[])?; if let Some(UpdatedFragmentOffsets(ref map)) = updated_fragment_offsets { for (frag_id, bitmap) in map { let mut buf: Vec = Vec::new(); - bitmap.serialize_into(&mut buf)?; + bitmap.serialize_into(&mut buf).map_err(|e| { + Error::runtime_error(format!( + "failed to serialize updatedFragmentOffsets for fragment \ + {frag_id}: {e}" + )) + })?; // JNI byte arrays are signed i8; reinterpret without copying. let buf_i8: &[i8] = unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const i8, buf.len()) @@ -1283,16 +1287,36 @@ fn convert_to_rust_operation( let jmap = JMap::from_env(env, &offsets_obj)?; let mut iter = jmap.iter(env)?; let mut offsets: HashMap = HashMap::new(); - env.with_local_frame(32, |env| { - while let Some((key, value)) = iter.next(env)? { - let frag_id = - env.call_method(&key, "longValue", "()J", &[])?.j()? as u64; - let buf: Vec = env.convert_byte_array(JByteArray::from(value))?; - let bitmap = RoaringBitmap::deserialize_from(buf.as_slice())?; - offsets.insert(frag_id, bitmap); + // Per-iteration local frame: iterator key/value JNI refs are released each + // loop so large multi-fragment maps cannot exhaust the local reference table. + loop { + let entry = env.with_local_frame( + 8, + |env| -> Result> { + let Some((key, value)) = iter.next(env)? else { + return Ok(None); + }; + let frag_id = + env.call_method(&key, "longValue", "()J", &[])?.j()? as u64; + let buf: Vec = + env.convert_byte_array(JByteArray::from(value))?; + let bitmap = RoaringBitmap::deserialize_from(buf.as_slice()) + .map_err(|e| { + Error::input_error(format!( + "invalid updatedFragmentOffsets RoaringBitmap bytes \ + for fragment {frag_id}: {e}" + )) + })?; + Ok(Some((frag_id, bitmap))) + }, + )?; + match entry { + None => break, + Some((frag_id, bitmap)) => { + offsets.insert(frag_id, bitmap); + } } - Ok::<(), Error>(()) - })?; + } if offsets.is_empty() { None } else { From 77683939a20328793768d95b2f6765b94f1ed12e Mon Sep 17 00:00:00 2001 From: Jing chen He Date: Sat, 20 Jun 2026 15:15:22 -0700 Subject: [PATCH 3/3] perf(java): return matched row offsets as Roaring bytes from FragmentUpdateResult Serialize matched_offsets once at the executor JNI boundary instead of expanding to long[]. Add getUpdatedRowOffsetBytes(); deprecate getUpdatedRowOffsets() with expandRowOffsetsFromBytes for #6650 compat. Follow-up to #6650. Pairs with Update.updatedFragmentOffsets() for lance-spark#418. Co-Authored-By: Claude Sonnet 4.6 --- java/lance-jni/Cargo.lock | 1 + java/lance-jni/src/fragment.rs | 110 ++++++++++++++++-- .../lance/fragment/FragmentUpdateResult.java | 51 ++++++-- .../fragment/FragmentUpdateResultTest.java | 106 +++++++++++++++++ 4 files changed, 252 insertions(+), 16 deletions(-) create mode 100644 java/src/test/java/org/lance/fragment/FragmentUpdateResultTest.java diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index ee52544ba57..a95124e1e98 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -4120,6 +4120,7 @@ dependencies = [ "lance-core", "num-traits", "rand 0.9.4", + "rayon", ] [[package]] diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index d6603925947..6470bbe61aa 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -5,7 +5,7 @@ use arrow::array::{RecordBatch, RecordBatchIterator, StructArray}; use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi_and_data_type}; use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; use arrow_schema::{DataType, Schema as ArrowSchema}; -use jni::objects::{JIntArray, JValue, JValueGen}; +use jni::objects::{JByteArray, JIntArray, JValue, JValueGen}; use jni::{ JNIEnv, objects::{JClass, JLongArray, JObject, JString}, @@ -19,6 +19,8 @@ use lance_io::utils::CachedFileSize; use lance_table::rowids::{RowIdSequence, write_row_ids}; use std::iter::once; +use roaring::RoaringBitmap; + use lance::dataset::fragment::write::FragmentCreateBuilder; use lance::io::ObjectStoreParams; use lance_datafusion::utils::StreamingWriteSource; @@ -48,8 +50,8 @@ pub(crate) struct FragmentMergeResult { pub(crate) struct FragmentUpdateResult { updated_fragment: Fragment, fields_modified: Vec, - /// Physical row offsets that received column updates (from `_rowaddr` low bits). - updated_row_offsets: Vec, + /// Matched row offsets serialized as portable RoaringBitmap bytes. + updated_row_offset_bytes: Vec, } ////////////////// @@ -539,15 +541,106 @@ fn inner_update_column<'local>( let right_on_str: String = right_on.extract(env)?; let r = RT.block_on(fragment.update_columns_with_offsets(reader, &left_on_str, &right_on_str))?; - let updated_row_offsets: Vec = r.matched_offsets.iter().map(|o| o as i64).collect(); + let updated_row_offset_bytes = serialize_matched_offsets(&r.matched_offsets)?; let result = FragmentUpdateResult { updated_fragment: r.fragment, fields_modified: r.fields_modified, - updated_row_offsets, + updated_row_offset_bytes, }; result.into_java(env) } +fn serialize_matched_offsets(bitmap: &RoaringBitmap) -> Result> { + let mut buf = Vec::new(); + bitmap.serialize_into(&mut buf).map_err(|e| { + Error::runtime_error(format!( + "failed to serialize matched row offsets RoaringBitmap: {e}" + )) + })?; + Ok(buf) +} + +fn deserialize_row_offset_bytes(bytes: &[u8]) -> Result { + if bytes.is_empty() { + return Ok(RoaringBitmap::new()); + } + RoaringBitmap::deserialize_from(bytes).map_err(|e| { + Error::input_error(format!( + "invalid updatedRowOffsetBytes RoaringBitmap bytes: {e}" + )) + }) +} + +fn expand_row_offset_bytes_to_i64(bitmap: &RoaringBitmap) -> Vec { + bitmap.iter().map(|o| o as i64).collect() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_fragment_FragmentUpdateResult_expandRowOffsetsFromBytes< + 'local, +>( + mut env: JNIEnv<'local>, + _cls: JClass, + jbytes: JByteArray, +) -> JLongArray<'local> { + ok_or_throw_with_return!( + env, + inner_expand_updated_row_offset_bytes(&mut env, jbytes), + unsafe { JLongArray::from_raw(std::ptr::null_mut()) } + ) +} + +fn inner_expand_updated_row_offset_bytes<'local>( + env: &mut JNIEnv<'local>, + jbytes: JByteArray, +) -> Result> { + let buf = env.convert_byte_array(&jbytes)?; + let bitmap = deserialize_row_offset_bytes(&buf)?; + let offsets = expand_row_offset_bytes_to_i64(&bitmap); + let arr = env.new_long_array(offsets.len() as i32)?; + if !offsets.is_empty() { + env.set_long_array_region(&arr, 0, &offsets)?; + } + Ok(arr) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_fragment_FragmentUpdateResult_encodeRowOffsetsToBytes< + 'local, +>( + mut env: JNIEnv<'local>, + _cls: JClass, + joffsets: JLongArray, +) -> JByteArray<'local> { + ok_or_throw_with_return!( + env, + inner_encode_updated_row_offset_bytes(&mut env, joffsets), + unsafe { JByteArray::from_raw(std::ptr::null_mut()) } + ) +} + +fn inner_encode_updated_row_offset_bytes<'local>( + env: &mut JNIEnv<'local>, + joffsets: JLongArray, +) -> Result> { + let len = env.get_array_length(&joffsets)?; + let mut buf: Vec = vec![0; len as usize]; + if len > 0 { + env.get_long_array_region(&joffsets, 0, buf.as_mut_slice())?; + } + let mut bitmap = RoaringBitmap::new(); + for offset in buf { + if offset < 0 { + return Err(Error::input_error(format!( + "updatedRowOffsets must be non-negative, got {offset}" + ))); + } + bitmap.insert(offset as u32); + } + let bytes = serialize_matched_offsets(&bitmap)?; + Ok(env.byte_array_from_slice(&bytes)?) +} + #[unsafe(no_mangle)] pub extern "system" fn Java_org_lance_fragment_RowIdMeta_nativeEncodeRowIds( mut env: JNIEnv, @@ -591,7 +684,7 @@ const FRAGMENT_MERGE_RESULT_CLASS: &str = "org/lance/fragment/FragmentMergeResul const FRAGMENT_MERGE_RESULT_CONSTRUCTOR_SIG: &str = "(Lorg/lance/FragmentMetadata;Lorg/lance/schema/LanceSchema;)V"; const FRAGMENT_UPDATE_RESULT_CLASS: &str = "org/lance/fragment/FragmentUpdateResult"; -const FRAGMENT_UPDATE_RESULT_CONSTRUCTOR_SIG: &str = "(Lorg/lance/FragmentMetadata;[J[J)V"; +const FRAGMENT_UPDATE_RESULT_CONSTRUCTOR_SIG: &str = "(Lorg/lance/FragmentMetadata;[J[B)V"; impl IntoJava for &FragmentMergeResult { fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { @@ -612,14 +705,15 @@ impl IntoJava for &FragmentUpdateResult { fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { let java_updated_fragment = self.updated_fragment.into_java(env)?; let java_fields_modified = JLance(self.fields_modified.clone()).into_java(env)?; - let java_updated_row_offsets = JLance(self.updated_row_offsets.clone()).into_java(env)?; + let java_updated_row_offset_bytes = + env.byte_array_from_slice(&self.updated_row_offset_bytes)?; Ok(env.new_object( FRAGMENT_UPDATE_RESULT_CLASS, FRAGMENT_UPDATE_RESULT_CONSTRUCTOR_SIG, &[ JValueGen::Object(&java_updated_fragment), JValueGen::Object(&java_fields_modified), - JValueGen::Object(&java_updated_row_offsets), + JValueGen::Object(&java_updated_row_offset_bytes), ], )?) } diff --git a/java/src/main/java/org/lance/fragment/FragmentUpdateResult.java b/java/src/main/java/org/lance/fragment/FragmentUpdateResult.java index cadb22b0e6a..0424c9e1507 100644 --- a/java/src/main/java/org/lance/fragment/FragmentUpdateResult.java +++ b/java/src/main/java/org/lance/fragment/FragmentUpdateResult.java @@ -26,19 +26,37 @@ public class FragmentUpdateResult { private final FragmentMetadata updatedFragment; private final long[] fieldsModified; - /** Local physical row offsets within the fragment that received updates (see RowAddress). */ - private final long[] updatedRowOffsets; + /** + * Matched physical row offsets within the fragment, serialized as portable RoaringBitmap bytes + * (little-endian, same format as {@link org.lance.operation.Update#updatedFragmentOffsets()}). + */ + private final byte[] updatedRowOffsetBytes; /** Two-argument form for callers that do not track per-row offsets; offsets default to empty. */ public FragmentUpdateResult(FragmentMetadata updatedFragment, long[] updatedFieldIds) { - this(updatedFragment, updatedFieldIds, new long[0]); + this(updatedFragment, updatedFieldIds, new byte[0]); } public FragmentUpdateResult( - FragmentMetadata updatedFragment, long[] updatedFieldIds, long[] updatedRowOffsets) { + FragmentMetadata updatedFragment, long[] updatedFieldIds, byte[] updatedRowOffsetBytes) { this.updatedFragment = updatedFragment; this.fieldsModified = updatedFieldIds; - this.updatedRowOffsets = updatedRowOffsets; + this.updatedRowOffsetBytes = + updatedRowOffsetBytes != null ? updatedRowOffsetBytes : new byte[0]; + } + + /** + * @deprecated Use {@link #getUpdatedRowOffsetBytes()} instead. This method expands serialized + * RoaringBitmap bytes to a {@code long[]} via JNI and is retained for backward compatibility + * with callers compiled against the #6650 API. + */ + @Deprecated + public FragmentUpdateResult( + FragmentMetadata updatedFragment, long[] updatedFieldIds, long[] updatedRowOffsets) { + this( + updatedFragment, + updatedFieldIds, + encodeRowOffsetsToBytes(updatedRowOffsets != null ? updatedRowOffsets : new long[0])); } public FragmentMetadata getUpdatedFragment() { @@ -49,17 +67,34 @@ public long[] getFieldsModified() { return fieldsModified; } - /** Physical row offsets (0-based within the fragment) whose columns were rewritten. */ + /** + * Physical row offsets (0-based within the fragment) whose columns were rewritten, as portable + * RoaringBitmap bytes. + */ + public byte[] getUpdatedRowOffsetBytes() { + return updatedRowOffsetBytes; + } + + /** + * Physical row offsets (0-based within the fragment) whose columns were rewritten. + * + * @deprecated Use {@link #getUpdatedRowOffsetBytes()} instead. + */ + @Deprecated public long[] getUpdatedRowOffsets() { - return updatedRowOffsets; + return expandRowOffsetsFromBytes(updatedRowOffsetBytes); } + private static native byte[] encodeRowOffsetsToBytes(long[] rowOffsets); + + private static native long[] expandRowOffsetsFromBytes(byte[] rowOffsetBytes); + @Override public String toString() { return MoreObjects.toStringHelper(this) .add("fragmentMetadata", updatedFragment) .add("updatedFieldIds", fieldsModified) - .add("updatedRowOffsets", updatedRowOffsets) + .add("updatedRowOffsetBytesLength", updatedRowOffsetBytes.length) .toString(); } } diff --git a/java/src/test/java/org/lance/fragment/FragmentUpdateResultTest.java b/java/src/test/java/org/lance/fragment/FragmentUpdateResultTest.java new file mode 100644 index 00000000000..fe62370a9cd --- /dev/null +++ b/java/src/test/java/org/lance/fragment/FragmentUpdateResultTest.java @@ -0,0 +1,106 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.fragment; + +import org.lance.CommitBuilder; +import org.lance.Dataset; +import org.lance.Fragment; +import org.lance.FragmentMetadata; +import org.lance.TestUtils; +import org.lance.Transaction; +import org.lance.operation.Append; + +import org.apache.arrow.memory.RootAllocator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Path; +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class FragmentUpdateResultTest { + + /** Portable RoaringBitmap bytes for offsets {1, 3, 5} (see UpdateTest round-trip fixture). */ + private static final byte[] PORTABLE_ROARING_BYTES_135 = + new byte[] { + (byte) 0x3A, + (byte) 0x30, + (byte) 0x00, + (byte) 0x00, + (byte) 0x01, + (byte) 0x00, + (byte) 0x00, + (byte) 0x00, + (byte) 0x00, + (byte) 0x00, + (byte) 0x02, + (byte) 0x00, + (byte) 0x10, + (byte) 0x00, + (byte) 0x00, + (byte) 0x00, + (byte) 0x01, + (byte) 0x00, + (byte) 0x03, + (byte) 0x00, + (byte) 0x05, + (byte) 0x00 + }; + + @Test + void testGetUpdatedRowOffsetBytesRoundTripViaDeprecatedGetter() { + FragmentUpdateResult result = + new FragmentUpdateResult(null, new long[0], PORTABLE_ROARING_BYTES_135); + assertArrayEquals(PORTABLE_ROARING_BYTES_135, result.getUpdatedRowOffsetBytes()); + assertArrayEquals(new long[] {1, 3, 5}, result.getUpdatedRowOffsets()); + } + + @Test + void testDeprecatedLongArrayConstructorEncodesToBytes() { + FragmentUpdateResult result = new FragmentUpdateResult(null, new long[0], new long[] {1, 3, 5}); + assertArrayEquals(new long[] {1, 3, 5}, result.getUpdatedRowOffsets()); + + // Stored bytes from the deprecated long[] constructor decode to the same offsets. + FragmentUpdateResult fromEncodedBytes = + new FragmentUpdateResult(null, new long[0], result.getUpdatedRowOffsetBytes()); + assertArrayEquals(new long[] {1, 3, 5}, fromEncodedBytes.getUpdatedRowOffsets()); + } + + @Test + void testUpdateColumnsReturnsMatchedRowOffsetBytes(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("testUpdateColumnsRowOffsetBytes").toString(); + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)) { + TestUtils.UpdateColumnTestDataset testDataset = + new TestUtils.UpdateColumnTestDataset(allocator, datasetPath); + try (Dataset dataset = testDataset.createEmptyDataset()) { + FragmentMetadata fragmentMeta = testDataset.createNewFragment(6); + try (Transaction appendTxn = + new Transaction.Builder() + .readVersion(dataset.version()) + .operation( + Append.builder().fragments(Collections.singletonList(fragmentMeta)).build()) + .build()) { + try (Dataset appended = new CommitBuilder(dataset).execute(appendTxn)) { + Fragment fragment = appended.getFragments().get(0); + FragmentUpdateResult updateResult = testDataset.updateColumn(fragment, 4); + assertTrue(updateResult.getUpdatedRowOffsetBytes().length > 0); + assertArrayEquals(new long[] {0, 1, 2, 3}, updateResult.getUpdatedRowOffsets()); + } + } + } + } + } +}