From 4425bdd9012dc2b91256cfc968b47d2cb9462e68 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 9 Jun 2026 09:18:27 -0400 Subject: [PATCH 1/2] slice REE values along with run_ends when ree.slice() is called --- arrow-array/src/array/run_array.rs | 160 +++++++++++++++++++++++++++-- arrow-string/src/length.rs | 33 ++++++ 2 files changed, 184 insertions(+), 9 deletions(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 09fb2998a22b..33d53e9ec4f0 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -341,14 +341,43 @@ impl RunArray { /// Returns a zero-copy slice of this array with the indicated offset and length. /// + /// The returned slice has its `values` and physical `run_ends` trimmed to only + /// the entries covering the requested logical range. + /// /// # Panics /// /// - Specified slice (`offset` + `length`) exceeds existing length pub fn slice(&self, offset: usize, length: usize) -> Self { + let run_ends = self.run_ends.slice(offset, length); + let (run_ends, values) = if length == 0 { + let inner = ScalarBuffer::from(vec![]); + let run_ends = unsafe { RunEndBuffer::new_unchecked(inner, 0, 0) }; + (run_ends, self.values.slice(0, 0)) + } else { + let start = run_ends.get_start_physical_index(); + let end = run_ends.get_end_physical_index(); + let values = self.values.slice(start, end - start + 1); + let logical_offset = run_ends.offset(); + let logical_length = run_ends.len(); + // Trim to the physical entries covering this slice and normalize the run_end + // values: subtract the logical offset and cap at the logical length. + // After this, run_ends[i] is the cumulative element count through run i, + // measured from position 0 of this slice, with no hidden offset context. + let adjusted: ScalarBuffer = run_ends.inner()[start..=end] + .iter() + .map(|v| { + R::Native::from_usize((v.as_usize() - logical_offset).min(logical_length)) + .unwrap() + }) + .collect::>() + .into(); + let run_ends = unsafe { RunEndBuffer::new_unchecked(adjusted, 0, run_ends.len()) }; + (run_ends, values) + }; Self { data_type: self.data_type.clone(), - run_ends: self.run_ends.slice(offset, length), - values: self.values.clone(), + run_ends, + values, } } } @@ -462,6 +491,9 @@ unsafe impl Array for RunArray { fn logical_nulls(&self) -> Option { let len = self.len(); + if len == 0 { + return None; + } let nulls = self.values.logical_nulls()?; let mut out = BooleanBufferBuilder::new(len); let offset = self.run_ends.offset(); @@ -1220,7 +1252,6 @@ mod tests { PrimitiveRunBuilder::::with_capacity(input_array.len()); builder.extend(input_array.iter().copied()); let run_array = builder.finish(); - let physical_values_array = run_array.values().as_primitive::(); // test for all slice lengths. for slice_len in 1..=total_len { @@ -1248,7 +1279,7 @@ mod tests { &logical_indices, sliced_input_array, &physical_indices, - physical_values_array, + sliced_run_array.values().as_primitive::(), ); // test for offset = total_len - slice_len and slice length = slice_len @@ -1270,7 +1301,7 @@ mod tests { &logical_indices, sliced_input_array, &physical_indices, - physical_values_array, + sliced_run_array.values().as_primitive::(), ); } } @@ -1291,8 +1322,12 @@ mod tests { let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)]; for (offset, length) in slices { let a = array.slice(offset, length); - let n = a.logical_nulls().unwrap(); - let n = n.into_iter().collect::>(); + // After trimming values in slice(), some sub-slices may have no nulls in their + // trimmed values, so logical_nulls() returns None (meaning all valid). + let n = match a.logical_nulls() { + Some(n) => n.into_iter().collect::>(), + None => vec![true; length], + }; assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}"); } } @@ -1385,8 +1420,9 @@ mod tests { let slice2 = array.slice(2, 3); // 1, 1, 1 // logical indices: 2, 3, 4 // physical indices: 1, 1, 1 - assert_eq!(slice2.get_start_physical_index(), 1); - assert_eq!(slice2.get_end_physical_index(), 1); + // After values trimming, physical indices are 0-based relative to the trimmed values. + assert_eq!(slice2.get_start_physical_index(), 0); + assert_eq!(slice2.get_end_physical_index(), 0); let values_slice2 = slice2.values_slice(); let values_slice2 = values_slice2.as_primitive::(); @@ -1508,6 +1544,112 @@ mod tests { }; } + mod slice_trims_values { + use super::*; + + fn make_array() -> RunArray { + let run_ends = Int32Array::from(vec![2, 5, 8]); + let values = StringArray::from(vec!["a", "b", "c"]); + // Logical: [a, a, b, b, b, c, c, c] + RunArray::::try_new(&run_ends, &values).unwrap() + } + + fn physical_values(array: &RunArray) -> Vec<&str> { + array.values().as_string::().iter().flatten().collect() + } + + fn logical_values(array: &RunArray) -> Vec<&str> { + array + .downcast::() + .unwrap() + .into_iter() + .flatten() + .collect() + } + + #[test] + fn test_slice_trims_values() { + // slice(2,4) → ["b","b","b","c"] + let sliced = make_array().slice(2, 4); + assert_eq!(physical_values(&sliced), vec!["b", "c"]); + assert_eq!(sliced.run_ends().values(), &[3i32, 4]); + } + + #[test] + fn test_slice_entirely_within_one_run() { + //slice(3,2) → ["b","b"] + let sliced = make_array().slice(3, 2); + assert_eq!(physical_values(&sliced), vec!["b"]); + assert_eq!(sliced.run_ends().values(), &[2i32]); + } + + #[test] + fn test_slice_crosses_multiple_run_boundaries() { + //slice(1,6) → ["a","b","b","b","c","c"] + let sliced = make_array().slice(1, 6); + assert_eq!(physical_values(&sliced), vec!["a", "b", "c"]); + assert_eq!(sliced.run_ends().values(), &[1i32, 4, 6]); + } + + #[test] + fn test_double_slice_preserves_normalization() { + //slice(2,6) → ["b","b","b","c","c","c"] + let first = make_array().slice(2, 6); + assert_eq!(physical_values(&first), vec!["b", "c"]); + assert_eq!(first.run_ends().values(), &[3i32, 6]); + + // slice(1,4) → ["b","b","c","c"] + let second = first.slice(1, 4); + assert_eq!(physical_values(&second), vec!["b", "c"]); + assert_eq!(second.run_ends().values(), &[2i32, 4]); + } + + #[test] + fn test_repeated_slice_normalization() { + let run_ends = Int32Array::from(vec![4, 8, 12, 16, 20]); + let values = StringArray::from(vec!["a", "b", "c", "d", "e"]); + let base = RunArray::::try_new(&run_ends, &values).unwrap(); + + // ["a","a","a","a","b","b","b","b","c","c","c","c","d","d","d","d","e","e","e","e"] + let l1 = base.slice(3, 14); + // ["a","b","b","b","b","c","c","c","c","d","d","d","d","e"] + assert_eq!(l1.run_ends().values(), &[1i32, 5, 9, 13, 14]); + assert_eq!( + logical_values(&l1), + vec![ + "a", "b", "b", "b", "b", "c", "c", "c", "c", "d", "d", "d", "d", "e" + ] + ); + + // ["a","b","b","b","b","c","c","c","c","d","d","d","d","e"] + let l2 = l1.slice(1, 10); + // ["b","b","b","b","c","c","c","c","d","d"] + assert_eq!(l2.run_ends().values(), &[4i32, 8, 10]); + assert_eq!( + logical_values(&l2), + vec!["b", "b", "b", "b", "c", "c", "c", "c", "d", "d"] + ); + + // ["b","b","b","b","c","c","c","c","d","d"] + let l3 = l2.slice(0, 6); + // ["b","b","b","b","c","c"] + assert_eq!(l3.run_ends().values(), &[4i32, 6]); + assert_eq!(logical_values(&l3), vec!["b", "b", "b", "b", "c", "c"]); + + // ["b","b","b","b","c","c"] + let l4 = l3.slice(3, 3); + // ["b","c","c"] + assert_eq!(l4.run_ends().values(), &[1i32, 3]); + assert_eq!(logical_values(&l4), vec!["b", "c", "c"]); + + // ["b","c","c"] + let l5 = l4.slice(1, 2); + // ["c","c"] + assert_eq!(l5.run_ends().values(), &[2i32]); + assert_eq!(logical_values(&l5), vec!["c", "c"]); + } + } + #[test] fn test_run_array_roundtrip() { let run = Int32Array::from(vec![3, 6, 9, 12]); diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 99a9bd69a6c3..acf80d9d6999 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -911,4 +911,37 @@ mod tests { let expected: Int32Array = vec![40, 40, 32].into(); assert_eq!(&expected, result_values); } + + #[test] + fn length_test_ree_sliced_does_less_work() { + use arrow_array::RunArray; + use arrow_array::types::Int32Type; + + // ["hello","hello","hello","world","world","world","foo","foo","foo","bar","bar","bar","baz","baz","baz"] + let values = StringArray::from(vec!["hello", "world", "foo", "bar", "baz"]); + let run_ends = PrimitiveArray::::from(vec![3i32, 6, 9, 12, 15]); + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + assert_eq!(ree_array.values().len(), 5); + + // ["foo","foo","foo","bar","bar","bar"] + let sliced = ree_array.slice(6, 6); + assert_eq!(sliced.values().len(), 2); + + // length() only runs on ["foo","bar"] — not all 5 original physical values + let result_arr = length(&sliced).unwrap(); + let result_ree = result_arr + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!( + result_ree + .values() + .as_any() + .downcast_ref::() + .unwrap(), + &Int32Array::from(vec![3, 3]) + ); + assert_eq!(result_ree.run_ends().values(), &[3i32, 6]); + } } From 5d0b825fac86daf47df9acdd6b0a078361030fbc Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 9 Jun 2026 20:43:01 -0400 Subject: [PATCH 2/2] add null test case & avoid .unwrap call --- arrow-array/src/array/run_array.rs | 48 +++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 33d53e9ec4f0..3850ded7e1fb 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -363,14 +363,22 @@ impl RunArray { // values: subtract the logical offset and cap at the logical length. // After this, run_ends[i] is the cumulative element count through run i, // measured from position 0 of this slice, with no hidden offset context. - let adjusted: ScalarBuffer = run_ends.inner()[start..=end] - .iter() - .map(|v| { - R::Native::from_usize((v.as_usize() - logical_offset).min(logical_length)) - .unwrap() - }) - .collect::>() - .into(); + let adjusted: ScalarBuffer = + if logical_offset == 0 && run_ends.inner()[end].as_usize() == logical_length { + // Already zero-based and last run end is exactly at the slice boundary, + // so no value transformation is needed — slice the buffer directly. + run_ends.inner().slice(start, end - start + 1) + } else { + run_ends.inner()[start..=end] + .iter() + .map(|v| { + R::Native::from_usize( + (v.as_usize() - logical_offset).min(logical_length), + ) + .expect("logical length fits in run-end type") + }) + .collect() + }; let run_ends = unsafe { RunEndBuffer::new_unchecked(adjusted, 0, run_ends.len()) }; (run_ends, values) }; @@ -1604,6 +1612,30 @@ mod tests { assert_eq!(second.run_ends().values(), &[2i32, 4]); } + #[test] + fn test_slice_with_nulls_preserves_logical_nulls() { + // Logical: [null, null, "a", "a", null, null, "b", "b"] + // Physical values: [null, "a", null, "b"], run_ends: [2, 4, 6, 8] + let run_ends = Int32Array::from(vec![2, 4, 6, 8]); + let values = StringArray::from(vec![None, Some("a"), None, Some("b")]); + let array = RunArray::::try_new(&run_ends, &values).unwrap(); + + // slice(1, 6) → [null, "a", "a", null, null, "b"] + let sliced = array.slice(1, 6); + assert_eq!(physical_values(&sliced), vec!["a", "b"]); // nulls excluded by flatten + let nulls = sliced.logical_nulls().unwrap(); + assert_eq!(nulls.null_count(), 3); + let actual: Vec = nulls.into_iter().collect(); + assert_eq!(actual, vec![false, true, true, false, false, true]); + + // slice(2, 4) → ["a", "a", null, null] + let sliced2 = array.slice(2, 4); + let nulls2 = sliced2.logical_nulls().unwrap(); + assert_eq!(nulls2.null_count(), 2); + let actual2: Vec = nulls2.into_iter().collect(); + assert_eq!(actual2, vec![true, true, false, false]); + } + #[test] fn test_repeated_slice_normalization() { let run_ends = Int32Array::from(vec![4, 8, 12, 16, 20]);