From 3b371acf5a18dc571277219fde0b255f5eff1473 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 15 Apr 2025 15:51:52 +0000 Subject: [PATCH 01/26] Add int96 stats test --- parquet/tests/int96_stats.rs | 80 ++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 parquet/tests/int96_stats.rs diff --git a/parquet/tests/int96_stats.rs b/parquet/tests/int96_stats.rs new file mode 100644 index 000000000000..8e90fc260744 --- /dev/null +++ b/parquet/tests/int96_stats.rs @@ -0,0 +1,80 @@ +use parquet::basic::Type; +use parquet::data_type::{Int96, Int96Type}; +use parquet::file::properties::{EnabledStatistics, WriterProperties}; +use parquet::file::reader::{FileReader, SerializedFileReader}; +use parquet::file::statistics::Statistics; +use parquet::file::writer::SerializedFileWriter; +use parquet::schema::parser::parse_message_type; +use std::fs::File; +use std::sync::Arc; +use tempfile::Builder; + +#[test] +fn test_int96_stats() { + // Create a temporary file + let tmp = Builder::new() + .prefix("test_int96_stats") + .tempfile() + .unwrap(); + let file_path = tmp.path().to_owned(); + + // Create schema with INT96 field + let message_type = " + message test { + REQUIRED INT96 timestamp; + } + "; + let schema = parse_message_type(message_type).unwrap(); + + // Configure writer properties to enable statistics + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + + // Create writer and write data + { + let file = File::create(&file_path).unwrap(); + let mut writer = SerializedFileWriter::new(file, schema.into(), Arc::new(props)).unwrap(); + let mut row_group = writer.next_row_group().unwrap(); + let mut col_writer = row_group.next_column().unwrap().unwrap(); + + // Create INT96 data + let data = vec![ + Int96::from(vec![1, 0, 0]), // min value + Int96::from(vec![2, 0, 0]), + Int96::from(vec![3, 0, 0]), + Int96::from(vec![4, 0, 0]), // max value + ]; + + // Write the data + { + let writer = col_writer.typed::(); + writer.write_batch(&data, None, None).unwrap(); + } + col_writer.close().unwrap(); + row_group.close().unwrap(); + writer.close().unwrap(); + } + + // Read the file back + let file = File::open(&file_path).unwrap(); + let reader = SerializedFileReader::new(file).unwrap(); + let metadata = reader.metadata(); + let row_group = metadata.row_group(0); + let column = row_group.column(0); + + // Get the statistics + let stats = column.statistics().unwrap(); + assert_eq!(stats.physical_type(), Type::INT96); + + if let Statistics::Int96(stats) = stats { + let min = stats.min_opt().unwrap(); + let max = stats.max_opt().unwrap(); + + // Verify the statistics + assert!(min < max, "min value should be less than max value"); + assert_eq!(stats.null_count_opt(), Some(0)); + } else { + panic!("Expected Int96 statistics"); + } +} \ No newline at end of file From fd51210e10e55aee133b352f2be860ff5b2f9bec Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 15 Apr 2025 16:10:12 +0000 Subject: [PATCH 02/26] fix conversions --- parquet/tests/int96_stats.rs | 79 +++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git a/parquet/tests/int96_stats.rs b/parquet/tests/int96_stats.rs index 8e90fc260744..cee783d24ac6 100644 --- a/parquet/tests/int96_stats.rs +++ b/parquet/tests/int96_stats.rs @@ -8,6 +8,69 @@ use parquet::schema::parser::parse_message_type; use std::fs::File; use std::sync::Arc; use tempfile::Builder; +use chrono::{DateTime, NaiveDateTime, Utc}; + +const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; +const NANOSECONDS_IN_DAY: i64 = 86_400 * 1_000_000_000; + +fn datetime_to_int96(dt: &str) -> Int96 { + let naive = NaiveDateTime::parse_from_str(dt, "%Y-%m-%d %H:%M:%S%.f").unwrap(); + let datetime: DateTime = DateTime::from_naive_utc_and_offset(naive, Utc); + let nanos = datetime.timestamp_nanos_opt().unwrap(); + + // Convert to INT96 format + let days = nanos / NANOSECONDS_IN_DAY; + let julian_day = (days + JULIAN_DAY_OF_EPOCH) as u32; + + let remaining_nanos = nanos % NANOSECONDS_IN_DAY; + let nanos_low = (remaining_nanos & 0xFFFFFFFF) as u32; + let nanos_high = ((remaining_nanos >> 32) & 0xFFFFFFFF) as u32; + + let mut int96 = Int96::new(); + // The order of components is: + // data[0] = low 32 bits of nanoseconds + // data[1] = high 32 bits of nanoseconds + // data[2] = Julian day + int96.set_data(nanos_low, nanos_high, julian_day); + int96 +} + +#[test] +fn test_int96_conversion() { + let test_timestamps = vec![ + "2020-01-01 00:00:00.000", + "2020-02-29 23:59:59.999", + "2020-12-31 23:59:59.999", + "2021-01-01 00:00:00.000", + "2023-06-15 12:30:45.500", + "2024-02-29 15:45:30.750", + "2024-12-25 07:00:00.000", + "2025-01-01 00:00:00.000", + "2025-07-04 20:00:00.000", + "2025-12-31 23:59:59.999", + ]; + + for dt in test_timestamps { + let naive = NaiveDateTime::parse_from_str(dt, "%Y-%m-%d %H:%M:%S%.f").unwrap(); + let datetime: DateTime = DateTime::from_naive_utc_and_offset(naive, Utc); + let nanos = datetime.timestamp_nanos_opt().unwrap(); + let expected_seconds = nanos / 1_000_000_000; + + let int96 = datetime_to_int96(dt); + let int96_seconds = int96.to_seconds(); + + println!("Timestamp: {}", dt); + println!(" Original nanos: {}", nanos); + println!(" Expected seconds: {}", expected_seconds); + println!(" INT96 components: {:?}", int96.data()); + println!(" INT96 seconds: {}", int96_seconds); + println!(" Days since epoch: {}", nanos / NANOSECONDS_IN_DAY); + println!(" Julian day: {}", (nanos / NANOSECONDS_IN_DAY) + JULIAN_DAY_OF_EPOCH); + println!(" Nanoseconds in day: {}", nanos % NANOSECONDS_IN_DAY); + + assert_eq!(expected_seconds, int96_seconds, "Seconds conversion mismatch for timestamp: {}", dt); + } +} #[test] fn test_int96_stats() { @@ -38,12 +101,18 @@ fn test_int96_stats() { let mut row_group = writer.next_row_group().unwrap(); let mut col_writer = row_group.next_column().unwrap().unwrap(); - // Create INT96 data + // Create INT96 data from timestamps let data = vec![ - Int96::from(vec![1, 0, 0]), // min value - Int96::from(vec![2, 0, 0]), - Int96::from(vec![3, 0, 0]), - Int96::from(vec![4, 0, 0]), // max value + datetime_to_int96("2020-01-01 00:00:00.000"), // New Year 2020 + datetime_to_int96("2020-02-29 23:59:59.999"), // Leap day 2020 + datetime_to_int96("2020-12-31 23:59:59.999"), // End of 2020 + datetime_to_int96("2021-01-01 00:00:00.000"), // Start of 2021 + datetime_to_int96("2023-06-15 12:30:45.500"), // Mid-2023 + datetime_to_int96("2024-02-29 15:45:30.750"), // Leap day 2024 + datetime_to_int96("2024-12-25 07:00:00.000"), // Christmas 2024 + datetime_to_int96("2025-01-01 00:00:00.000"), // New Year 2025 + datetime_to_int96("2025-07-04 20:00:00.000"), // July 4th 2025 + datetime_to_int96("2025-12-31 23:59:59.999"), // End of 2025 ]; // Write the data From 72687dd89aec8525a9306d718b23a9a764caa09d Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 15 Apr 2025 16:17:00 +0000 Subject: [PATCH 03/26] asserts --- parquet/tests/int96_stats.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/parquet/tests/int96_stats.rs b/parquet/tests/int96_stats.rs index cee783d24ac6..17e43c718bee 100644 --- a/parquet/tests/int96_stats.rs +++ b/parquet/tests/int96_stats.rs @@ -95,6 +95,11 @@ fn test_int96_stats() { .build(); // Create writer and write data + let first_timestamp = "2020-01-01 00:00:00.000"; // First timestamp + let last_timestamp = "2025-12-31 23:59:59.999"; // Last timestamp + let expected_min = datetime_to_int96(first_timestamp); + let expected_max = datetime_to_int96(last_timestamp); + { let file = File::create(&file_path).unwrap(); let mut writer = SerializedFileWriter::new(file, schema.into(), Arc::new(props)).unwrap(); @@ -141,8 +146,14 @@ fn test_int96_stats() { let max = stats.max_opt().unwrap(); // Verify the statistics - assert!(min < max, "min value should be less than max value"); + println!("Min timestamp ({}): {:?}", first_timestamp, min.data()); + println!("Max timestamp ({}): {:?}", last_timestamp, max.data()); + assert_eq!(*min, expected_min, "Min value should be {}", first_timestamp); + assert_eq!(*max, expected_max, "Max value should be {}", last_timestamp); assert_eq!(stats.null_count_opt(), Some(0)); + + println!("Min timestamp ({}): {:?}", first_timestamp, min.data()); + println!("Max timestamp ({}): {:?}", last_timestamp, max.data()); } else { panic!("Expected Int96 statistics"); } From 46d98e85069309ff4fb1c2ad011222f2ee924cd6 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 15 Apr 2025 16:23:38 +0000 Subject: [PATCH 04/26] printing change --- parquet/tests/int96_stats.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/tests/int96_stats.rs b/parquet/tests/int96_stats.rs index 17e43c718bee..3e9aaf52dfd3 100644 --- a/parquet/tests/int96_stats.rs +++ b/parquet/tests/int96_stats.rs @@ -146,7 +146,7 @@ fn test_int96_stats() { let max = stats.max_opt().unwrap(); // Verify the statistics - println!("Min timestamp ({}): {:?}", first_timestamp, min.data()); + println!("Min timestamp ({}): {:?}", first_timestamp, min.data()); println!("Max timestamp ({}): {:?}", last_timestamp, max.data()); assert_eq!(*min, expected_min, "Min value should be {}", first_timestamp); assert_eq!(*max, expected_max, "Max value should be {}", last_timestamp); From 93a780c4b63a964197fd5d887e8fe8c78ccf1bda Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 13:01:05 +0000 Subject: [PATCH 05/26] Create int96 from time since epoch --- parquet/src/data_type.rs | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 79ecbea45ebe..cb36dbcd6d6e 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -124,6 +124,43 @@ impl Int96 { .wrapping_add(nanos) } + /// Sets the INT96 data from seconds since epoch + /// + /// Will wrap around on overflow + #[inline] + pub fn set_data_from_seconds(&mut self, seconds: i64) { + self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS)); + } + + /// Sets the INT96 data from milliseconds since epoch + /// + /// Will wrap around on overflow + #[inline] + pub fn set_data_from_millis(&mut self, millis: i64) { + self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS)); + } + + /// Sets the INT96 data from microseconds since epoch + /// + /// Will wrap around on overflow + #[inline] + pub fn set_data_from_micros(&mut self, micros: i64) { + self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS)); + } + + /// Sets the INT96 data from nanoseconds since epoch + /// + /// Will wrap around on overflow + #[inline] + pub fn set_data_from_nanos(&mut self, nanos: i64) { + let days = nanos / NANOSECONDS_IN_DAY; + let remaining_nanos = nanos % NANOSECONDS_IN_DAY; + let julian_day = (days + JULIAN_DAY_OF_EPOCH) as u32; + let nanos_low = (remaining_nanos & 0xFFFFFFFF) as u32; + let nanos_high = ((remaining_nanos >> 32) & 0xFFFFFFFF) as u32; + self.set_data(nanos_low, nanos_high, julian_day); + } + #[inline] fn data_as_days_and_nanos(&self) -> (i32, i64) { let day = self.data()[2] as i32; @@ -1409,4 +1446,27 @@ mod tests { assert_eq!(ba1, ba11); assert!(ba5 > ba1); } + + #[test] + fn test_int96_time_conversions() { + let test_values = [ + 0, 1, 60, 3600, 86400, 1234567, 31536000, + ]; + + for &value in &test_values { + let mut i96 = Int96::new(); + + i96.set_data_from_seconds(value); + assert_eq!(i96.to_seconds(), value, "seconds roundtrip failed for {}", value); + + i96.set_data_from_seconds(value); + assert_eq!(i96.to_millis(), value, "millis roundtrip failed for {}", value); + + i96.set_data_from_seconds(value); + assert_eq!(i96.to_micros(), value, "micros roundtrip failed for {}", value); + + i96.set_data_from_seconds(value); + assert_eq!(i96.to_nanos(), value, "nanos roundtrip failed for {}", value); + } + } } From a5b9eb7264e7da29daf11308a70a3ecb93e9e1b6 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 13:15:22 +0000 Subject: [PATCH 06/26] Add ways to set int96 from timestamps --- parquet/src/data_type.rs | 44 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index cb36dbcd6d6e..052cfc8f8ab4 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -161,6 +161,19 @@ impl Int96 { self.set_data(nanos_low, nanos_high, julian_day); } + /// Sets the INT96 data directly from days and nanoseconds + /// + /// This is the most direct way to set the Int96 data structure which internally + /// stores days and nanoseconds. The days should be Julian days since epoch. + + #[inline] + pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) { + let julian_day = (days as i32) as u32; + let nanos_low = (nanos & 0xFFFFFFFF) as u32; + let nanos_high = ((nanos >> 32) & 0xFFFFFFFF) as u32; + self.set_data(nanos_low, nanos_high, julian_day); + } + #[inline] fn data_as_days_and_nanos(&self) -> (i32, i64) { let day = self.data()[2] as i32; @@ -1454,19 +1467,42 @@ mod tests { ]; for &value in &test_values { - let mut i96 = Int96::new(); + let mut i96: Int96 = Int96::new(); i96.set_data_from_seconds(value); assert_eq!(i96.to_seconds(), value, "seconds roundtrip failed for {}", value); - i96.set_data_from_seconds(value); + i96.set_data_from_millis(value); assert_eq!(i96.to_millis(), value, "millis roundtrip failed for {}", value); - i96.set_data_from_seconds(value); + i96.set_data_from_micros(value); assert_eq!(i96.to_micros(), value, "micros roundtrip failed for {}", value); - i96.set_data_from_seconds(value); + i96.set_data_from_nanos(value); assert_eq!(i96.to_nanos(), value, "nanos roundtrip failed for {}", value); + + let test_day_nanos = [ + (0, 0), // 1970-01-01 00:00:00.000000000 (Unix epoch) + (0, 1), // 1970-01-01 00:00:00.000000001 + (0, NANOSECONDS - 1), // 1970-01-01 00:00:00.999999999 + (0, NANOSECONDS), // 1970-01-01 00:00:01.000000000 + (1, 0), // 1970-01-02 00:00:00.000000000 + (1, NANOSECONDS), // 1970-01-02 00:00:01.000000000 + (365, 0), // 1971-01-01 00:00:00.000000000 (1 year after epoch) + (365, NANOSECONDS * 3600), // 1971-01-01 01:00:00.000000000 + (10957, 0), // 2000-01-01 00:00:00.000000000 (Y2K) + (18262, 0), // 2020-01-01 00:00:00.000000000 + (18262, NANOSECONDS * 3600 * 12), // 2020-01-01 12:00:00.000000000 + ]; + + for &(days, nanos) in &test_day_nanos { + let mut i96 = Int96::new(); + i96.set_data_from_days_and_nanos(days, nanos); + let (roundtrip_days, roundtrip_nanos) = i96.data_as_days_and_nanos(); + assert_eq!(roundtrip_days, days, "days roundtrip failed for days={}, nanos={}", days, nanos); + assert_eq!(roundtrip_nanos, nanos, "nanos roundtrip failed for days={}, nanos={}", days, nanos); + } } } + } From 796243bd792d0866fb18e87d64ae257460ca26d4 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 13:19:13 +0000 Subject: [PATCH 07/26] simplify --- parquet/src/data_type.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 052cfc8f8ab4..52667a8cef95 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -155,10 +155,8 @@ impl Int96 { pub fn set_data_from_nanos(&mut self, nanos: i64) { let days = nanos / NANOSECONDS_IN_DAY; let remaining_nanos = nanos % NANOSECONDS_IN_DAY; - let julian_day = (days + JULIAN_DAY_OF_EPOCH) as u32; - let nanos_low = (remaining_nanos & 0xFFFFFFFF) as u32; - let nanos_high = ((remaining_nanos >> 32) & 0xFFFFFFFF) as u32; - self.set_data(nanos_low, nanos_high, julian_day); + let julian_day = (days + JULIAN_DAY_OF_EPOCH) as i32; + self.set_data_from_days_and_nanos(julian_day, remaining_nanos); } /// Sets the INT96 data directly from days and nanoseconds From 34c928d7f9146724ca699b1de4e4f9cde2a9d47d Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 15:48:44 +0000 Subject: [PATCH 08/26] Add correct ordering for int96 --- parquet/src/data_type.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 52667a8cef95..d2934cc3d72f 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -33,7 +33,7 @@ use crate::util::bit_util::FromBytes; /// Rust representation for logical type INT96, value is backed by an array of `u32`. /// The type only takes 12 bytes, without extra padding. -#[derive(Clone, Copy, Debug, PartialOrd, Default, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct Int96 { value: [u32; 3], } @@ -180,6 +180,24 @@ impl Int96 { } } +impl PartialOrd for Int96 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Int96 { + fn cmp(&self, other: &Self) -> Ordering { + let (self_days, self_nanos) = self.data_as_days_and_nanos(); + let (other_days, other_nanos) = other.data_as_days_and_nanos(); + + match self_days.cmp(&other_days) { + Ordering::Equal => self_nanos.cmp(&other_nanos), + ord => ord, + } + } +} + impl From> for Int96 { fn from(buf: Vec) -> Self { assert_eq!(buf.len(), 3); From eb8a77c0e6aaeab997d9de8177e862b034aac928 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 15:56:14 +0000 Subject: [PATCH 09/26] Add tests for int96ordering --- parquet/src/data_type.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index d2934cc3d72f..6b2098edd40a 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -1521,4 +1521,27 @@ mod tests { } } + #[test] + fn test_int96_ord() { + let test_pairs = [ + + ((99, 5), (100, 4)), + ((100, 10), (100, 5523)), + ((0, 0), (100, 0)), + ((10000, 1_000_000_000), (10000, 2_000_000_000)), + ((10000, 1_000_000_000), (20000, 1_000_000_000)), + ]; + + for (smaller, larger) in test_pairs { + let mut small = Int96::new(); + small.set_data_from_days_and_nanos(smaller.0, smaller.1); + let mut large = Int96::new(); + large.set_data_from_days_and_nanos(larger.0, larger.1); + + assert!(small < large, "Expected {:?} < {:?}", smaller, larger); + assert!(large > small, "Expected {:?} > {:?}", larger, smaller); + assert!(small == small, "Expected {:?} == {:?}", smaller, smaller); + assert!(large == large, "Expected {:?} == {:?}", larger, larger); + } + } } From ef07163a09e532fab90eb8b63074d0f5b9f885c6 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 16:00:47 +0000 Subject: [PATCH 10/26] rename tests --- parquet/tests/{int96_stats.rs => int96_stats_roundtrip.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename parquet/tests/{int96_stats.rs => int96_stats_roundtrip.rs} (100%) diff --git a/parquet/tests/int96_stats.rs b/parquet/tests/int96_stats_roundtrip.rs similarity index 100% rename from parquet/tests/int96_stats.rs rename to parquet/tests/int96_stats_roundtrip.rs From 6abd75fd749524e06e3caec52daaca24ed4994cb Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 16:08:18 +0000 Subject: [PATCH 11/26] Simplify test --- parquet/tests/int96_stats_roundtrip.rs | 100 +++++-------------------- 1 file changed, 20 insertions(+), 80 deletions(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index 3e9aaf52dfd3..052b6ae3cf4c 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -10,68 +10,15 @@ use std::sync::Arc; use tempfile::Builder; use chrono::{DateTime, NaiveDateTime, Utc}; -const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; -const NANOSECONDS_IN_DAY: i64 = 86_400 * 1_000_000_000; - fn datetime_to_int96(dt: &str) -> Int96 { let naive = NaiveDateTime::parse_from_str(dt, "%Y-%m-%d %H:%M:%S%.f").unwrap(); let datetime: DateTime = DateTime::from_naive_utc_and_offset(naive, Utc); let nanos = datetime.timestamp_nanos_opt().unwrap(); - - // Convert to INT96 format - let days = nanos / NANOSECONDS_IN_DAY; - let julian_day = (days + JULIAN_DAY_OF_EPOCH) as u32; - - let remaining_nanos = nanos % NANOSECONDS_IN_DAY; - let nanos_low = (remaining_nanos & 0xFFFFFFFF) as u32; - let nanos_high = ((remaining_nanos >> 32) & 0xFFFFFFFF) as u32; - let mut int96 = Int96::new(); - // The order of components is: - // data[0] = low 32 bits of nanoseconds - // data[1] = high 32 bits of nanoseconds - // data[2] = Julian day - int96.set_data(nanos_low, nanos_high, julian_day); + int96.set_data_from_nanos(nanos); int96 } -#[test] -fn test_int96_conversion() { - let test_timestamps = vec![ - "2020-01-01 00:00:00.000", - "2020-02-29 23:59:59.999", - "2020-12-31 23:59:59.999", - "2021-01-01 00:00:00.000", - "2023-06-15 12:30:45.500", - "2024-02-29 15:45:30.750", - "2024-12-25 07:00:00.000", - "2025-01-01 00:00:00.000", - "2025-07-04 20:00:00.000", - "2025-12-31 23:59:59.999", - ]; - - for dt in test_timestamps { - let naive = NaiveDateTime::parse_from_str(dt, "%Y-%m-%d %H:%M:%S%.f").unwrap(); - let datetime: DateTime = DateTime::from_naive_utc_and_offset(naive, Utc); - let nanos = datetime.timestamp_nanos_opt().unwrap(); - let expected_seconds = nanos / 1_000_000_000; - - let int96 = datetime_to_int96(dt); - let int96_seconds = int96.to_seconds(); - - println!("Timestamp: {}", dt); - println!(" Original nanos: {}", nanos); - println!(" Expected seconds: {}", expected_seconds); - println!(" INT96 components: {:?}", int96.data()); - println!(" INT96 seconds: {}", int96_seconds); - println!(" Days since epoch: {}", nanos / NANOSECONDS_IN_DAY); - println!(" Julian day: {}", (nanos / NANOSECONDS_IN_DAY) + JULIAN_DAY_OF_EPOCH); - println!(" Nanoseconds in day: {}", nanos % NANOSECONDS_IN_DAY); - - assert_eq!(expected_seconds, int96_seconds, "Seconds conversion mismatch for timestamp: {}", dt); - } -} - #[test] fn test_int96_stats() { // Create a temporary file @@ -94,32 +41,30 @@ fn test_int96_stats() { .set_statistics_enabled(EnabledStatistics::Page) .build(); + // Create INT96 data from timestamps + let data = vec![ + datetime_to_int96("2020-01-01 00:00:00.000"), // New Year 2020 + datetime_to_int96("2020-02-29 23:59:59.999"), // Leap day 2020 + datetime_to_int96("2020-12-31 23:59:59.999"), // End of 2020 + datetime_to_int96("2021-01-01 00:00:00.000"), // Start of 2021 + datetime_to_int96("2023-06-15 12:30:45.500"), // Mid-2023 + datetime_to_int96("2024-02-29 15:45:30.750"), // Leap day 2024 + datetime_to_int96("2024-12-25 07:00:00.000"), // Christmas 2024 + datetime_to_int96("2025-01-01 00:00:00.000"), // New Year 2025 + datetime_to_int96("2025-07-04 20:00:00.000"), // July 4th 2025 + datetime_to_int96("2025-12-31 23:59:59.999"), // End of 2025 + ]; + + let expected_min = data[0]; + let expected_max = data[data.len() - 1]; + // Create writer and write data - let first_timestamp = "2020-01-01 00:00:00.000"; // First timestamp - let last_timestamp = "2025-12-31 23:59:59.999"; // Last timestamp - let expected_min = datetime_to_int96(first_timestamp); - let expected_max = datetime_to_int96(last_timestamp); - { let file = File::create(&file_path).unwrap(); let mut writer = SerializedFileWriter::new(file, schema.into(), Arc::new(props)).unwrap(); let mut row_group = writer.next_row_group().unwrap(); let mut col_writer = row_group.next_column().unwrap().unwrap(); - // Create INT96 data from timestamps - let data = vec![ - datetime_to_int96("2020-01-01 00:00:00.000"), // New Year 2020 - datetime_to_int96("2020-02-29 23:59:59.999"), // Leap day 2020 - datetime_to_int96("2020-12-31 23:59:59.999"), // End of 2020 - datetime_to_int96("2021-01-01 00:00:00.000"), // Start of 2021 - datetime_to_int96("2023-06-15 12:30:45.500"), // Mid-2023 - datetime_to_int96("2024-02-29 15:45:30.750"), // Leap day 2024 - datetime_to_int96("2024-12-25 07:00:00.000"), // Christmas 2024 - datetime_to_int96("2025-01-01 00:00:00.000"), // New Year 2025 - datetime_to_int96("2025-07-04 20:00:00.000"), // July 4th 2025 - datetime_to_int96("2025-12-31 23:59:59.999"), // End of 2025 - ]; - // Write the data { let writer = col_writer.typed::(); @@ -146,14 +91,9 @@ fn test_int96_stats() { let max = stats.max_opt().unwrap(); // Verify the statistics - println!("Min timestamp ({}): {:?}", first_timestamp, min.data()); - println!("Max timestamp ({}): {:?}", last_timestamp, max.data()); - assert_eq!(*min, expected_min, "Min value should be {}", first_timestamp); - assert_eq!(*max, expected_max, "Max value should be {}", last_timestamp); + assert_eq!(*min, expected_min, "Min value should be {} but was {}", expected_min, min); + assert_eq!(*max, expected_max, "Max value should be {} but was {}", expected_max, max); assert_eq!(stats.null_count_opt(), Some(0)); - - println!("Min timestamp ({}): {:?}", first_timestamp, min.data()); - println!("Max timestamp ({}): {:?}", last_timestamp, max.data()); } else { panic!("Expected Int96 statistics"); } From 6108b144f4f5bbb8ad0206bb08337d1662b65935 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 16:15:40 +0000 Subject: [PATCH 12/26] Refactor test --- parquet/tests/int96_stats_roundtrip.rs | 34 ++++++++++++++------------ 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index 052b6ae3cf4c..3b5df1a02b48 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -19,8 +19,7 @@ fn datetime_to_int96(dt: &str) -> Int96 { int96 } -#[test] -fn test_int96_stats() { +fn verify_ordering(data: Vec) { // Create a temporary file let tmp = Builder::new() .prefix("test_int96_stats") @@ -41,20 +40,6 @@ fn test_int96_stats() { .set_statistics_enabled(EnabledStatistics::Page) .build(); - // Create INT96 data from timestamps - let data = vec![ - datetime_to_int96("2020-01-01 00:00:00.000"), // New Year 2020 - datetime_to_int96("2020-02-29 23:59:59.999"), // Leap day 2020 - datetime_to_int96("2020-12-31 23:59:59.999"), // End of 2020 - datetime_to_int96("2021-01-01 00:00:00.000"), // Start of 2021 - datetime_to_int96("2023-06-15 12:30:45.500"), // Mid-2023 - datetime_to_int96("2024-02-29 15:45:30.750"), // Leap day 2024 - datetime_to_int96("2024-12-25 07:00:00.000"), // Christmas 2024 - datetime_to_int96("2025-01-01 00:00:00.000"), // New Year 2025 - datetime_to_int96("2025-07-04 20:00:00.000"), // July 4th 2025 - datetime_to_int96("2025-12-31 23:59:59.999"), // End of 2025 - ]; - let expected_min = data[0]; let expected_max = data[data.len() - 1]; @@ -97,4 +82,21 @@ fn test_int96_stats() { } else { panic!("Expected Int96 statistics"); } +} + +#[test] +fn test_int96_stats() { + let data = vec![ + datetime_to_int96("2020-01-01 00:00:00.000"), // New Year 2020 + datetime_to_int96("2020-02-29 23:59:59.999"), // Leap day 2020 + datetime_to_int96("2020-12-31 23:59:59.999"), // End of 2020 + datetime_to_int96("2021-01-01 00:00:00.000"), // Start of 2021 + datetime_to_int96("2023-06-15 12:30:45.500"), // Mid-2023 + datetime_to_int96("2024-02-29 15:45:30.750"), // Leap day 2024 + datetime_to_int96("2024-12-25 07:00:00.000"), // Christmas 2024 + datetime_to_int96("2025-01-01 00:00:00.000"), // New Year 2025 + datetime_to_int96("2025-07-04 20:00:00.000"), // July 4th 2025 + datetime_to_int96("2025-12-31 23:59:59.999"), // End of 2025 + ]; + verify_ordering(data); } \ No newline at end of file From 515555613c902cecbeb909a5fefffdadf711b610 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 17 Apr 2025 16:17:21 +0000 Subject: [PATCH 13/26] simplify test --- parquet/tests/int96_stats_roundtrip.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index 3b5df1a02b48..327a4f0b6a19 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -87,16 +87,16 @@ fn verify_ordering(data: Vec) { #[test] fn test_int96_stats() { let data = vec![ - datetime_to_int96("2020-01-01 00:00:00.000"), // New Year 2020 - datetime_to_int96("2020-02-29 23:59:59.999"), // Leap day 2020 - datetime_to_int96("2020-12-31 23:59:59.999"), // End of 2020 - datetime_to_int96("2021-01-01 00:00:00.000"), // Start of 2021 - datetime_to_int96("2023-06-15 12:30:45.500"), // Mid-2023 - datetime_to_int96("2024-02-29 15:45:30.750"), // Leap day 2024 - datetime_to_int96("2024-12-25 07:00:00.000"), // Christmas 2024 - datetime_to_int96("2025-01-01 00:00:00.000"), // New Year 2025 - datetime_to_int96("2025-07-04 20:00:00.000"), // July 4th 2025 - datetime_to_int96("2025-12-31 23:59:59.999"), // End of 2025 + datetime_to_int96("2020-01-01 00:00:00.000"), + datetime_to_int96("2020-02-29 23:59:59.999"), + datetime_to_int96("2020-12-31 23:59:59.999"), + datetime_to_int96("2021-01-01 00:00:00.000"), + datetime_to_int96("2023-06-15 12:30:45.500"), + datetime_to_int96("2024-02-29 15:45:30.750"), + datetime_to_int96("2024-12-25 07:00:00.000"), + datetime_to_int96("2025-01-01 00:00:00.000"), + datetime_to_int96("2025-07-04 20:00:00.000"), + datetime_to_int96("2025-12-31 23:59:59.999"), ]; verify_ordering(data); -} \ No newline at end of file +} From 4b0b94dd7e5ea3b97556a63752f39a5f79bd8dd1 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 17 Jun 2025 14:13:15 +0000 Subject: [PATCH 14/26] Improve testcase --- parquet/tests/int96_stats_roundtrip.rs | 37 ++++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index 327a4f0b6a19..65b7a04b5eb8 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -43,14 +43,12 @@ fn verify_ordering(data: Vec) { let expected_min = data[0]; let expected_max = data[data.len() - 1]; - // Create writer and write data { let file = File::create(&file_path).unwrap(); let mut writer = SerializedFileWriter::new(file, schema.into(), Arc::new(props)).unwrap(); let mut row_group = writer.next_row_group().unwrap(); let mut col_writer = row_group.next_column().unwrap().unwrap(); - // Write the data { let writer = col_writer.typed::(); writer.write_batch(&data, None, None).unwrap(); @@ -60,14 +58,12 @@ fn verify_ordering(data: Vec) { writer.close().unwrap(); } - // Read the file back let file = File::open(&file_path).unwrap(); let reader = SerializedFileReader::new(file).unwrap(); let metadata = reader.metadata(); let row_group = metadata.row_group(0); let column = row_group.column(0); - // Get the statistics let stats = column.statistics().unwrap(); assert_eq!(stats.physical_type(), Type::INT96); @@ -75,7 +71,6 @@ fn verify_ordering(data: Vec) { let min = stats.min_opt().unwrap(); let max = stats.max_opt().unwrap(); - // Verify the statistics assert_eq!(*min, expected_min, "Min value should be {} but was {}", expected_min, min); assert_eq!(*max, expected_max, "Max value should be {} but was {}", expected_max, max); assert_eq!(stats.null_count_opt(), Some(0)); @@ -85,18 +80,38 @@ fn verify_ordering(data: Vec) { } #[test] -fn test_int96_stats() { +fn test_multiple_dates() { let data = vec![ datetime_to_int96("2020-01-01 00:00:00.000"), - datetime_to_int96("2020-02-29 23:59:59.999"), - datetime_to_int96("2020-12-31 23:59:59.999"), + datetime_to_int96("2020-02-29 23:59:59.000"), + datetime_to_int96("2020-12-31 23:59:59.000"), datetime_to_int96("2021-01-01 00:00:00.000"), - datetime_to_int96("2023-06-15 12:30:45.500"), - datetime_to_int96("2024-02-29 15:45:30.750"), + datetime_to_int96("2023-06-15 12:30:45.000"), + datetime_to_int96("2024-02-29 15:45:30.000"), datetime_to_int96("2024-12-25 07:00:00.000"), datetime_to_int96("2025-01-01 00:00:00.000"), datetime_to_int96("2025-07-04 20:00:00.000"), - datetime_to_int96("2025-12-31 23:59:59.999"), + datetime_to_int96("2025-12-31 23:59:59.000"), + ]; + verify_ordering(data); +} + +#[test] +fn test_same_day_different_time() { + let data = vec![ + datetime_to_int96("2020-01-01 00:01:00.000"), + datetime_to_int96("2020-01-01 00:02:00.000"), + datetime_to_int96("2020-01-01 00:03:00.000"), + ]; + verify_ordering(data); +} + +#[test] +fn test_increasing_day_decreasing_time() { + let data = vec![ + datetime_to_int96("2020-01-01 12:00:00.000"), + datetime_to_int96("2020-02-01 11:00:00.000"), + datetime_to_int96("2020-03-01 10:00:00.000"), ]; verify_ordering(data); } From 325d3354c962b0cea0e5c67e4d6a83bad90fa013 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 17 Jun 2025 14:18:03 +0000 Subject: [PATCH 15/26] shuffle data before writing to file --- parquet/tests/int96_stats_roundtrip.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index 65b7a04b5eb8..9155fa05d0f2 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -5,6 +5,7 @@ use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::file::statistics::Statistics; use parquet::file::writer::SerializedFileWriter; use parquet::schema::parser::parse_message_type; +use rand::seq::SliceRandom; use std::fs::File; use std::sync::Arc; use tempfile::Builder; @@ -51,7 +52,9 @@ fn verify_ordering(data: Vec) { { let writer = col_writer.typed::(); - writer.write_batch(&data, None, None).unwrap(); + let mut shuffled_data = data.clone(); + shuffled_data.shuffle(&mut rand::rng()); + writer.write_batch(&shuffled_data, None, None).unwrap(); } col_writer.close().unwrap(); row_group.close().unwrap(); From 6036398bd7a37f6f3bd824c5198fd0e7785f9526 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 17 Jun 2025 15:51:12 +0000 Subject: [PATCH 16/26] change int96 internal format --- parquet/src/data_type.rs | 101 ++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 55 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 6b2098edd40a..e92626dabd86 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -35,7 +35,10 @@ use crate::util::bit_util::FromBytes; /// The type only takes 12 bytes, without extra padding. #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct Int96 { - value: [u32; 3], + /// First 8 bytes store nanoseconds since midnight + pub nanos: i64, + /// Last 4 bytes store Julian days + pub days: i32, } const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; @@ -59,19 +62,25 @@ const NANOSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * NANOSECONDS; impl Int96 { /// Creates new INT96 type struct with no data set. pub fn new() -> Self { - Self { value: [0; 3] } + Self { nanos: 0, days: 0 } } - /// Returns underlying data as slice of [`u32`]. + /// Returns underlying data as slice of [`u32`] for compatibility with Parquet format #[inline] pub fn data(&self) -> &[u32] { - &self.value + // SAFETY: We're reinterpreting the bytes of our struct as [u32; 3] + // This is safe because: + // 1. The memory layout is compatible (12 bytes total) + // 2. The alignment requirements are met (u32 requires 4-byte alignment) + // 3. We maintain the invariant that the bytes are always valid u32s + unsafe { std::slice::from_raw_parts(self as *const Int96 as *const u32, 3) } } - /// Sets data for this INT96 type. + /// Sets data for this INT96 type from raw Parquet format. #[inline] pub fn set_data(&mut self, elem0: u32, elem1: u32, elem2: u32) { - self.value = [elem0, elem1, elem2]; + self.nanos = ((elem1 as i64) << 32) | (elem0 as i64); + self.days = elem2 as i32; } /// Converts this INT96 into an i64 representing the number of MILLISECONDS since Epoch @@ -124,33 +133,19 @@ impl Int96 { .wrapping_add(nanos) } - /// Sets the INT96 data from seconds since epoch - /// - /// Will wrap around on overflow - #[inline] - pub fn set_data_from_seconds(&mut self, seconds: i64) { - self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS)); - } - - /// Sets the INT96 data from milliseconds since epoch - /// - /// Will wrap around on overflow + /// Sets the INT96 data directly from days and nanoseconds #[inline] - pub fn set_data_from_millis(&mut self, millis: i64) { - self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS)); + pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) { + self.days = days; + self.nanos = nanos; } - /// Sets the INT96 data from microseconds since epoch - /// - /// Will wrap around on overflow #[inline] - pub fn set_data_from_micros(&mut self, micros: i64) { - self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS)); + fn data_as_days_and_nanos(&self) -> (i32, i64) { + (self.days, self.nanos) } /// Sets the INT96 data from nanoseconds since epoch - /// - /// Will wrap around on overflow #[inline] pub fn set_data_from_nanos(&mut self, nanos: i64) { let days = nanos / NANOSECONDS_IN_DAY; @@ -159,24 +154,32 @@ impl Int96 { self.set_data_from_days_and_nanos(julian_day, remaining_nanos); } - /// Sets the INT96 data directly from days and nanoseconds - /// - /// This is the most direct way to set the Int96 data structure which internally - /// stores days and nanoseconds. The days should be Julian days since epoch. + /// Sets the INT96 data from seconds since epoch + #[inline] + pub fn set_data_from_seconds(&mut self, seconds: i64) { + self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS)) + } + /// Sets the INT96 data from milliseconds since epoch #[inline] - pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) { - let julian_day = (days as i32) as u32; - let nanos_low = (nanos & 0xFFFFFFFF) as u32; - let nanos_high = ((nanos >> 32) & 0xFFFFFFFF) as u32; - self.set_data(nanos_low, nanos_high, julian_day); + pub fn set_data_from_millis(&mut self, millis: i64) { + self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS)) } + /// Sets the INT96 data from microseconds since epoch #[inline] - fn data_as_days_and_nanos(&self) -> (i32, i64) { - let day = self.data()[2] as i32; - let nanos = ((self.data()[1] as i64) << 32) + self.data()[0] as i64; - (day, nanos) + pub fn set_data_from_micros(&mut self, micros: i64) { + self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS)) + } +} + + +impl From> for Int96 { + fn from(buf: Vec) -> Self { + assert_eq!(buf.len(), 3); + let mut result = Self::new(); + result.set_data(buf[0], buf[1], buf[2]); + result } } @@ -188,25 +191,13 @@ impl PartialOrd for Int96 { impl Ord for Int96 { fn cmp(&self, other: &Self) -> Ordering { - let (self_days, self_nanos) = self.data_as_days_and_nanos(); - let (other_days, other_nanos) = other.data_as_days_and_nanos(); - - match self_days.cmp(&other_days) { - Ordering::Equal => self_nanos.cmp(&other_nanos), + match self.days.cmp(&other.days) { + Ordering::Equal => self.nanos.cmp(&other.nanos), ord => ord, } } } -impl From> for Int96 { - fn from(buf: Vec) -> Self { - assert_eq!(buf.len(), 3); - let mut result = Self::new(); - result.set_data(buf[0], buf[1], buf[2]); - result - } -} - impl fmt::Display for Int96 { #[cold] fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -675,8 +666,8 @@ impl AsBytes for bool { impl AsBytes for Int96 { fn as_bytes(&self) -> &[u8] { - // SAFETY: Int96::data is a &[u32; 3]. - unsafe { std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) } + // SAFETY: The layout of Int96 is i64 followed by i32, which is 12 contiguous bytes + unsafe { std::slice::from_raw_parts(self as *const Int96 as *const u8, 12) } } } From a4b004957628e75e55bd45f0aa6629735f982435 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Wed, 25 Jun 2025 09:34:52 +0000 Subject: [PATCH 17/26] Revert "change int96 internal format" This reverts commit 6036398bd7a37f6f3bd824c5198fd0e7785f9526. --- parquet/src/data_type.rs | 101 +++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 46 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index e92626dabd86..6b2098edd40a 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -35,10 +35,7 @@ use crate::util::bit_util::FromBytes; /// The type only takes 12 bytes, without extra padding. #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct Int96 { - /// First 8 bytes store nanoseconds since midnight - pub nanos: i64, - /// Last 4 bytes store Julian days - pub days: i32, + value: [u32; 3], } const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; @@ -62,25 +59,19 @@ const NANOSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * NANOSECONDS; impl Int96 { /// Creates new INT96 type struct with no data set. pub fn new() -> Self { - Self { nanos: 0, days: 0 } + Self { value: [0; 3] } } - /// Returns underlying data as slice of [`u32`] for compatibility with Parquet format + /// Returns underlying data as slice of [`u32`]. #[inline] pub fn data(&self) -> &[u32] { - // SAFETY: We're reinterpreting the bytes of our struct as [u32; 3] - // This is safe because: - // 1. The memory layout is compatible (12 bytes total) - // 2. The alignment requirements are met (u32 requires 4-byte alignment) - // 3. We maintain the invariant that the bytes are always valid u32s - unsafe { std::slice::from_raw_parts(self as *const Int96 as *const u32, 3) } + &self.value } - /// Sets data for this INT96 type from raw Parquet format. + /// Sets data for this INT96 type. #[inline] pub fn set_data(&mut self, elem0: u32, elem1: u32, elem2: u32) { - self.nanos = ((elem1 as i64) << 32) | (elem0 as i64); - self.days = elem2 as i32; + self.value = [elem0, elem1, elem2]; } /// Converts this INT96 into an i64 representing the number of MILLISECONDS since Epoch @@ -133,19 +124,33 @@ impl Int96 { .wrapping_add(nanos) } - /// Sets the INT96 data directly from days and nanoseconds + /// Sets the INT96 data from seconds since epoch + /// + /// Will wrap around on overflow #[inline] - pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) { - self.days = days; - self.nanos = nanos; + pub fn set_data_from_seconds(&mut self, seconds: i64) { + self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS)); } + /// Sets the INT96 data from milliseconds since epoch + /// + /// Will wrap around on overflow #[inline] - fn data_as_days_and_nanos(&self) -> (i32, i64) { - (self.days, self.nanos) + pub fn set_data_from_millis(&mut self, millis: i64) { + self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS)); + } + + /// Sets the INT96 data from microseconds since epoch + /// + /// Will wrap around on overflow + #[inline] + pub fn set_data_from_micros(&mut self, micros: i64) { + self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS)); } /// Sets the INT96 data from nanoseconds since epoch + /// + /// Will wrap around on overflow #[inline] pub fn set_data_from_nanos(&mut self, nanos: i64) { let days = nanos / NANOSECONDS_IN_DAY; @@ -154,32 +159,24 @@ impl Int96 { self.set_data_from_days_and_nanos(julian_day, remaining_nanos); } - /// Sets the INT96 data from seconds since epoch - #[inline] - pub fn set_data_from_seconds(&mut self, seconds: i64) { - self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS)) - } + /// Sets the INT96 data directly from days and nanoseconds + /// + /// This is the most direct way to set the Int96 data structure which internally + /// stores days and nanoseconds. The days should be Julian days since epoch. - /// Sets the INT96 data from milliseconds since epoch #[inline] - pub fn set_data_from_millis(&mut self, millis: i64) { - self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS)) + pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) { + let julian_day = (days as i32) as u32; + let nanos_low = (nanos & 0xFFFFFFFF) as u32; + let nanos_high = ((nanos >> 32) & 0xFFFFFFFF) as u32; + self.set_data(nanos_low, nanos_high, julian_day); } - /// Sets the INT96 data from microseconds since epoch #[inline] - pub fn set_data_from_micros(&mut self, micros: i64) { - self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS)) - } -} - - -impl From> for Int96 { - fn from(buf: Vec) -> Self { - assert_eq!(buf.len(), 3); - let mut result = Self::new(); - result.set_data(buf[0], buf[1], buf[2]); - result + fn data_as_days_and_nanos(&self) -> (i32, i64) { + let day = self.data()[2] as i32; + let nanos = ((self.data()[1] as i64) << 32) + self.data()[0] as i64; + (day, nanos) } } @@ -191,13 +188,25 @@ impl PartialOrd for Int96 { impl Ord for Int96 { fn cmp(&self, other: &Self) -> Ordering { - match self.days.cmp(&other.days) { - Ordering::Equal => self.nanos.cmp(&other.nanos), + let (self_days, self_nanos) = self.data_as_days_and_nanos(); + let (other_days, other_nanos) = other.data_as_days_and_nanos(); + + match self_days.cmp(&other_days) { + Ordering::Equal => self_nanos.cmp(&other_nanos), ord => ord, } } } +impl From> for Int96 { + fn from(buf: Vec) -> Self { + assert_eq!(buf.len(), 3); + let mut result = Self::new(); + result.set_data(buf[0], buf[1], buf[2]); + result + } +} + impl fmt::Display for Int96 { #[cold] fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -666,8 +675,8 @@ impl AsBytes for bool { impl AsBytes for Int96 { fn as_bytes(&self) -> &[u8] { - // SAFETY: The layout of Int96 is i64 followed by i32, which is 12 contiguous bytes - unsafe { std::slice::from_raw_parts(self as *const Int96 as *const u8, 12) } + // SAFETY: Int96::data is a &[u32; 3]. + unsafe { std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) } } } From 4bca51b58da1aed648609b338d3976bba747f604 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Wed, 25 Jun 2025 09:48:07 +0000 Subject: [PATCH 18/26] Make the changes minimal --- parquet/src/data_type.rs | 120 +------------------------ parquet/tests/int96_stats_roundtrip.rs | 10 ++- 2 files changed, 10 insertions(+), 120 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 6b2098edd40a..fddd465f308d 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -124,54 +124,6 @@ impl Int96 { .wrapping_add(nanos) } - /// Sets the INT96 data from seconds since epoch - /// - /// Will wrap around on overflow - #[inline] - pub fn set_data_from_seconds(&mut self, seconds: i64) { - self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS)); - } - - /// Sets the INT96 data from milliseconds since epoch - /// - /// Will wrap around on overflow - #[inline] - pub fn set_data_from_millis(&mut self, millis: i64) { - self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS)); - } - - /// Sets the INT96 data from microseconds since epoch - /// - /// Will wrap around on overflow - #[inline] - pub fn set_data_from_micros(&mut self, micros: i64) { - self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS)); - } - - /// Sets the INT96 data from nanoseconds since epoch - /// - /// Will wrap around on overflow - #[inline] - pub fn set_data_from_nanos(&mut self, nanos: i64) { - let days = nanos / NANOSECONDS_IN_DAY; - let remaining_nanos = nanos % NANOSECONDS_IN_DAY; - let julian_day = (days + JULIAN_DAY_OF_EPOCH) as i32; - self.set_data_from_days_and_nanos(julian_day, remaining_nanos); - } - - /// Sets the INT96 data directly from days and nanoseconds - /// - /// This is the most direct way to set the Int96 data structure which internally - /// stores days and nanoseconds. The days should be Julian days since epoch. - - #[inline] - pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) { - let julian_day = (days as i32) as u32; - let nanos_low = (nanos & 0xFFFFFFFF) as u32; - let nanos_high = ((nanos >> 32) & 0xFFFFFFFF) as u32; - self.set_data(nanos_low, nanos_high, julian_day); - } - #[inline] fn data_as_days_and_nanos(&self) -> (i32, i64) { let day = self.data()[2] as i32; @@ -190,14 +142,13 @@ impl Ord for Int96 { fn cmp(&self, other: &Self) -> Ordering { let (self_days, self_nanos) = self.data_as_days_and_nanos(); let (other_days, other_nanos) = other.data_as_days_and_nanos(); - + match self_days.cmp(&other_days) { Ordering::Equal => self_nanos.cmp(&other_nanos), ord => ord, } } } - impl From> for Int96 { fn from(buf: Vec) -> Self { assert_eq!(buf.len(), 3); @@ -1475,73 +1426,4 @@ mod tests { assert_eq!(ba1, ba11); assert!(ba5 > ba1); } - - #[test] - fn test_int96_time_conversions() { - let test_values = [ - 0, 1, 60, 3600, 86400, 1234567, 31536000, - ]; - - for &value in &test_values { - let mut i96: Int96 = Int96::new(); - - i96.set_data_from_seconds(value); - assert_eq!(i96.to_seconds(), value, "seconds roundtrip failed for {}", value); - - i96.set_data_from_millis(value); - assert_eq!(i96.to_millis(), value, "millis roundtrip failed for {}", value); - - i96.set_data_from_micros(value); - assert_eq!(i96.to_micros(), value, "micros roundtrip failed for {}", value); - - i96.set_data_from_nanos(value); - assert_eq!(i96.to_nanos(), value, "nanos roundtrip failed for {}", value); - - let test_day_nanos = [ - (0, 0), // 1970-01-01 00:00:00.000000000 (Unix epoch) - (0, 1), // 1970-01-01 00:00:00.000000001 - (0, NANOSECONDS - 1), // 1970-01-01 00:00:00.999999999 - (0, NANOSECONDS), // 1970-01-01 00:00:01.000000000 - (1, 0), // 1970-01-02 00:00:00.000000000 - (1, NANOSECONDS), // 1970-01-02 00:00:01.000000000 - (365, 0), // 1971-01-01 00:00:00.000000000 (1 year after epoch) - (365, NANOSECONDS * 3600), // 1971-01-01 01:00:00.000000000 - (10957, 0), // 2000-01-01 00:00:00.000000000 (Y2K) - (18262, 0), // 2020-01-01 00:00:00.000000000 - (18262, NANOSECONDS * 3600 * 12), // 2020-01-01 12:00:00.000000000 - ]; - - for &(days, nanos) in &test_day_nanos { - let mut i96 = Int96::new(); - i96.set_data_from_days_and_nanos(days, nanos); - let (roundtrip_days, roundtrip_nanos) = i96.data_as_days_and_nanos(); - assert_eq!(roundtrip_days, days, "days roundtrip failed for days={}, nanos={}", days, nanos); - assert_eq!(roundtrip_nanos, nanos, "nanos roundtrip failed for days={}, nanos={}", days, nanos); - } - } - } - - #[test] - fn test_int96_ord() { - let test_pairs = [ - - ((99, 5), (100, 4)), - ((100, 10), (100, 5523)), - ((0, 0), (100, 0)), - ((10000, 1_000_000_000), (10000, 2_000_000_000)), - ((10000, 1_000_000_000), (20000, 1_000_000_000)), - ]; - - for (smaller, larger) in test_pairs { - let mut small = Int96::new(); - small.set_data_from_days_and_nanos(smaller.0, smaller.1); - let mut large = Int96::new(); - large.set_data_from_days_and_nanos(larger.0, larger.1); - - assert!(small < large, "Expected {:?} < {:?}", smaller, larger); - assert!(large > small, "Expected {:?} > {:?}", larger, smaller); - assert!(small == small, "Expected {:?} == {:?}", smaller, smaller); - assert!(large == large, "Expected {:?} == {:?}", larger, larger); - } - } } diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index 9155fa05d0f2..aa74a6d2d77c 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -16,7 +16,15 @@ fn datetime_to_int96(dt: &str) -> Int96 { let datetime: DateTime = DateTime::from_naive_utc_and_offset(naive, Utc); let nanos = datetime.timestamp_nanos_opt().unwrap(); let mut int96 = Int96::new(); - int96.set_data_from_nanos(nanos); + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const NANOSECONDS_IN_DAY: i64 = 86_400_000_000_000; + let days = nanos / NANOSECONDS_IN_DAY; + let remaining_nanos = nanos % NANOSECONDS_IN_DAY; + let julian_day = (days + JULIAN_DAY_OF_EPOCH) as i32; + let julian_day_u32 = julian_day as u32; + let nanos_low = (remaining_nanos & 0xFFFFFFFF) as u32; + let nanos_high = ((remaining_nanos >> 32) & 0xFFFFFFFF) as u32; + int96.set_data(nanos_low, nanos_high, julian_day_u32); int96 } From f18442932d9cbdbfc2214562f4dadd4eb8a4bb96 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 3 Jul 2025 09:25:42 +0000 Subject: [PATCH 19/26] Save instructions in the comparison --- parquet/src/data_type.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index fddd465f308d..85d9fe044f8e 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -124,11 +124,19 @@ impl Int96 { .wrapping_add(nanos) } + #[inline] + fn get_days(&self) -> i32 { + self.data()[2] as i32 + } + + #[inline] + fn get_nanos(&self) -> i64 { + ((self.data()[1] as i64) << 32) + self.data()[0] as i64 + } + #[inline] fn data_as_days_and_nanos(&self) -> (i32, i64) { - let day = self.data()[2] as i32; - let nanos = ((self.data()[1] as i64) << 32) + self.data()[0] as i64; - (day, nanos) + (self.get_days(), self.get_nanos()) } } @@ -140,11 +148,8 @@ impl PartialOrd for Int96 { impl Ord for Int96 { fn cmp(&self, other: &Self) -> Ordering { - let (self_days, self_nanos) = self.data_as_days_and_nanos(); - let (other_days, other_nanos) = other.data_as_days_and_nanos(); - - match self_days.cmp(&other_days) { - Ordering::Equal => self_nanos.cmp(&other_nanos), + match self.get_days().cmp(&other.get_days()) { + Ordering::Equal => self.get_nanos().cmp(&other.get_nanos()), ord => ord, } } From 081d20f0c8b546d158393cc8bf17d35d0e846f1e Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 3 Jul 2025 09:31:37 +0000 Subject: [PATCH 20/26] Fix test --- parquet/src/column/writer/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index efc7993c70d6..a1243d30872b 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2527,8 +2527,8 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { - assert_eq!(stats.min_opt().unwrap(), &Int96::from(vec![0, 20, 30])); - assert_eq!(stats.max_opt().unwrap(), &Int96::from(vec![3, 20, 10])); + assert_eq!(stats.min_opt().unwrap(), &Int96::from(vec![3, 20, 10])); + assert_eq!(stats.max_opt().unwrap(), &Int96::from(vec![2, 20, 30])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); } From 2a3e8029eba5af58c20064ba341dd07cc0a08782 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 3 Jul 2025 09:34:06 +0000 Subject: [PATCH 21/26] Add comments explaining the reasoning behind the new ordering --- parquet/src/data_type.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 85d9fe044f8e..11d425f6ff7b 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -147,6 +147,14 @@ impl PartialOrd for Int96 { } impl Ord for Int96 { + /// Order `Int96` correctly for (deprecated) timestamp types. + /// + /// Note: this is done even though the Int96 type is deprecated and the + /// [spec does not define the sort order] + /// because some engines, notably Spark and DataBricks photon still write + /// Int96 timestamps and rely on their order for optimization. + /// + /// [spec does not define the sort order]: https://github.com/apache/parquet-format/blob/cf943c197f4fad826b14ba0c40eb0ffdab585285/src/main/thrift/parquet.thrift#L1079 fn cmp(&self, other: &Self) -> Ordering { match self.get_days().cmp(&other.get_days()) { Ordering::Equal => self.get_nanos().cmp(&other.get_nanos()), From 3c1d4b03c5259efaea9d5f81a7108e36cbe2a044 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Thu, 3 Jul 2025 09:35:49 +0000 Subject: [PATCH 22/26] Remove stale comment --- parquet/src/file/statistics.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index b7522a76f0fc..0bf0de0fbccf 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -210,9 +210,6 @@ pub fn from_thrift( old_format, ), Type::INT96 => { - // INT96 statistics may not be correct, because comparison is signed - // byte-wise, not actual timestamps. It is recommended to ignore - // min/max statistics for INT96 columns. let min = if let Some(data) = min { assert_eq!(data.len(), 12); Some(Int96::try_from_le_slice(&data)?) From 84b05cee030a6fa5476f6905ba79e71d36d4c72e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Jul 2025 06:33:51 -0700 Subject: [PATCH 23/26] clippy --- parquet/tests/int96_stats_roundtrip.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index aa74a6d2d77c..f23bedaddbc6 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -82,8 +82,8 @@ fn verify_ordering(data: Vec) { let min = stats.min_opt().unwrap(); let max = stats.max_opt().unwrap(); - assert_eq!(*min, expected_min, "Min value should be {} but was {}", expected_min, min); - assert_eq!(*max, expected_max, "Max value should be {} but was {}", expected_max, max); + assert_eq!(*min, expected_min, "Min value should be {expected_min} but was {min}"); + assert_eq!(*max, expected_max, "Max value should be {expected_max} but was {max}"); assert_eq!(stats.null_count_opt(), Some(0)); } else { panic!("Expected Int96 statistics"); From c008cadb2d9f7f61af8ea5de09ae0354488312db Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Jul 2025 06:34:45 -0700 Subject: [PATCH 24/26] lint --- parquet/tests/int96_stats_roundtrip.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index f23bedaddbc6..c3e0800d48ce 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -1,3 +1,4 @@ +use chrono::{DateTime, NaiveDateTime, Utc}; use parquet::basic::Type; use parquet::data_type::{Int96, Int96Type}; use parquet::file::properties::{EnabledStatistics, WriterProperties}; @@ -9,7 +10,6 @@ use rand::seq::SliceRandom; use std::fs::File; use std::sync::Arc; use tempfile::Builder; -use chrono::{DateTime, NaiveDateTime, Utc}; fn datetime_to_int96(dt: &str) -> Int96 { let naive = NaiveDateTime::parse_from_str(dt, "%Y-%m-%d %H:%M:%S%.f").unwrap(); @@ -77,13 +77,19 @@ fn verify_ordering(data: Vec) { let stats = column.statistics().unwrap(); assert_eq!(stats.physical_type(), Type::INT96); - + if let Statistics::Int96(stats) = stats { let min = stats.min_opt().unwrap(); let max = stats.max_opt().unwrap(); - - assert_eq!(*min, expected_min, "Min value should be {expected_min} but was {min}"); - assert_eq!(*max, expected_max, "Max value should be {expected_max} but was {max}"); + + assert_eq!( + *min, expected_min, + "Min value should be {expected_min} but was {min}" + ); + assert_eq!( + *max, expected_max, + "Max value should be {expected_max} but was {max}" + ); assert_eq!(stats.null_count_opt(), Some(0)); } else { panic!("Expected Int96 statistics"); From cd63d0a06bd846ad1471cc22c5df1cd9d68028bf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 3 Jul 2025 06:39:51 -0700 Subject: [PATCH 25/26] add license statement --- parquet/tests/int96_stats_roundtrip.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/parquet/tests/int96_stats_roundtrip.rs b/parquet/tests/int96_stats_roundtrip.rs index c3e0800d48ce..d6ba8d419e3e 100644 --- a/parquet/tests/int96_stats_roundtrip.rs +++ b/parquet/tests/int96_stats_roundtrip.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use chrono::{DateTime, NaiveDateTime, Utc}; use parquet::basic::Type; use parquet::data_type::{Int96, Int96Type}; From 810b25cec1bb1a899ef4ef036f4674d73f155c81 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Jul 2025 11:29:01 -0400 Subject: [PATCH 26/26] Update parquet/src/data_type.rs Co-authored-by: Alkis Evlogimenos --- parquet/src/data_type.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index aa7de262bcb2..6cba02ab3eea 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -145,7 +145,7 @@ impl Ord for Int96 { /// /// Note: this is done even though the Int96 type is deprecated and the /// [spec does not define the sort order] - /// because some engines, notably Spark and DataBricks photon still write + /// because some engines, notably Spark and Databricks Photon still write /// Int96 timestamps and rely on their order for optimization. /// /// [spec does not define the sort order]: https://github.com/apache/parquet-format/blob/cf943c197f4fad826b14ba0c40eb0ffdab585285/src/main/thrift/parquet.thrift#L1079