From d8433f2e06acc13f398850c3d03600ddb752dbe2 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Wed, 20 May 2026 16:00:01 +0800
Subject: [PATCH 1/3] feat: enhance format_string with local IntegerValue
 adapter and optimize match arms

- Added a local IntegerValue adapter to format_string functionality.
- Collapsed duplicated match arms for %d, %x, %o, %s, %c to improve code efficiency.
- Ensured preservation of signed width behavior for negative hex and octal values.
- Introduced a regression table test covering integer widths and null values.
---
 .../src/function/string/format_string.rs      | 385 +++++++++---------
 1 file changed, 189 insertions(+), 196 deletions(-)
diff --git a/datafusion/spark/src/function/string/format_string.rs b/datafusion/spark/src/function/string/format_string.rs
index 51e4ebfa7b465..2cec1790e69b3 100644
--- a/datafusion/spark/src/function/string/format_string.rs
+++ b/datafusion/spark/src/function/string/format_string.rs
@@ -891,20 +891,46 @@ fn unsigned_to_char(value: u64) -> Result<char> {
     codepoint_to_char(codepoint)
 }
 
-/// Convert a non-null integer scalar to a [`char`] for the `%c` conversion.
-fn integer_scalar_to_char(scalar: &ScalarValue) -> Result<char> {
-    match scalar {
-        ScalarValue::Int8(Some(value)) => signed_to_char(*value as i64),
-        ScalarValue::Int16(Some(value)) => signed_to_char(*value as i64),
-        ScalarValue::Int32(Some(value)) => signed_to_char(*value as i64),
-        ScalarValue::Int64(Some(value)) => signed_to_char(*value),
-        ScalarValue::UInt8(Some(value)) => unsigned_to_char(*value as u64),
-        ScalarValue::UInt16(Some(value)) => unsigned_to_char(*value as u64),
-        ScalarValue::UInt32(Some(value)) => unsigned_to_char(*value as u64),
-        ScalarValue::UInt64(Some(value)) => unsigned_to_char(*value),
-        _ => datafusion_common::internal_err!(
-            "integer_scalar_to_char expects a non-null integer scalar, got {scalar:?}"
-        ),
+#[derive(Debug, Clone, Copy)]
+enum IntegerValue {
+    Signed { decimal: i64, unsigned_bits: u64 },
+    Unsigned(u64),
+}
+
+impl IntegerValue {
+    fn decimal(self) -> IntegerFormatValue {
+        match self {
+            Self::Signed { decimal, .. } => IntegerFormatValue::Signed(decimal),
+            Self::Unsigned(value) => IntegerFormatValue::Unsigned(value),
+        }
+    }
+
+    fn unsigned_bits(self) -> u64 {
+        match self {
+            Self::Signed { unsigned_bits, .. } => unsigned_bits,
+            Self::Unsigned(value) => value,
+        }
+    }
+
+    fn to_char(self) -> Result<char> {
+        match self {
+            Self::Signed { decimal, .. } => signed_to_char(decimal),
+            Self::Unsigned(value) => unsigned_to_char(value),
+        }
+    }
+}
+
+enum IntegerFormatValue {
+    Signed(i64),
+    Unsigned(u64),
+}
+
+impl std::fmt::Display for IntegerFormatValue {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Signed(value) => write!(f, "{value}"),
+            Self::Unsigned(value) => write!(f, "{value}"),
+        }
     }
 }
 
@@ -940,189 +966,56 @@ impl ConversionSpecifier {
 
                 _ => self.format_boolean(string, value),
             },
-            ScalarValue::Int8(Some(_))
-            | ScalarValue::Int16(Some(_))
-            | ScalarValue::Int32(Some(_))
-            | ScalarValue::Int64(Some(_))
-            | ScalarValue::UInt8(Some(_))
-            | ScalarValue::UInt16(Some(_))
-            | ScalarValue::UInt32(Some(_))
-            | ScalarValue::UInt64(Some(_))
-                if matches!(
-                    self.conversion_type,
-                    ConversionType::CharLower | ConversionType::CharUpper
-                ) =>
-            {
-                self.format_char(string, integer_scalar_to_char(value)?)
+            ScalarValue::Int8(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Signed {
+                    decimal: value as i64,
+                    unsigned_bits: (value as u8) as u64,
+                }),
+                "Int8",
+            ),
+            ScalarValue::Int16(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Signed {
+                    decimal: value as i64,
+                    unsigned_bits: (value as u16) as u64,
+                }),
+                "Int16",
+            ),
+            ScalarValue::Int32(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Signed {
+                    decimal: value as i64,
+                    unsigned_bits: (value as u32) as u64,
+                }),
+                "Int32",
+            ),
+            ScalarValue::Int64(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Signed {
+                    decimal: value,
+                    unsigned_bits: value as u64,
+                }),
+                "Int64",
+            ),
+            ScalarValue::UInt8(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Unsigned(value as u64)),
+                "UInt8",
+            ),
+            ScalarValue::UInt16(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Unsigned(value as u64)),
+                "UInt16",
+            ),
+            ScalarValue::UInt32(value) => self.format_integer(
+                string,
+                value.map(|value| IntegerValue::Unsigned(value as u64)),
+                "UInt32",
+            ),
+            ScalarValue::UInt64(value) => {
+                self.format_integer(string, value.map(IntegerValue::Unsigned), "UInt64")
             }
-            ScalarValue::Int8(value) => match (self.conversion_type, value) {
-                (ConversionType::DecInt, Some(value)) => {
-                    self.format_signed(string, *value as i64)
-                }
-                (
-                    ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, (*value as u8) as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for Int8",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::Int16(value) => match (self.conversion_type, value) {
-                (ConversionType::DecInt, Some(value)) => {
-                    self.format_signed(string, *value as i64)
-                }
-                (
-                    ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, (*value as u16) as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for Int16",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::Int32(value) => match (self.conversion_type, value) {
-                (ConversionType::DecInt, Some(value)) => {
-                    self.format_signed(string, *value as i64)
-                }
-                (
-                    ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, (*value as u32) as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for Int32",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::Int64(value) => match (self.conversion_type, value) {
-                (ConversionType::DecInt, Some(value)) => {
-                    self.format_signed(string, *value)
-                }
-                (
-                    ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, *value as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for Int64",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::UInt8(value) => match (self.conversion_type, value) {
-                (
-                    ConversionType::DecInt
-                    | ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, *value as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for UInt8",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::UInt16(value) => match (self.conversion_type, value) {
-                (
-                    ConversionType::DecInt
-                    | ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, *value as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for UInt16",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::UInt32(value) => match (self.conversion_type, value) {
-                (
-                    ConversionType::DecInt
-                    | ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, *value as u64),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for UInt32",
-                        self.conversion_type
-                    )
-                }
-            },
-            ScalarValue::UInt64(value) => match (self.conversion_type, value) {
-                (
-                    ConversionType::DecInt
-                    | ConversionType::HexIntLower
-                    | ConversionType::HexIntUpper
-                    | ConversionType::OctInt,
-                    Some(value),
-                ) => self.format_unsigned(string, *value),
-                (
-                    ConversionType::StringLower | ConversionType::StringUpper,
-                    Some(value),
-                ) => self.format_string(string, &value.to_string()),
-                (t, None) if t.supports_integer() => self.format_string(string, "null"),
-                _ => {
-                    exec_err!(
-                        "Invalid conversion type: {:?} for UInt64",
-                        self.conversion_type
-                    )
-                }
-            },
             ScalarValue::Float16(value) => match (self.conversion_type, value) {
                 (
                     ConversionType::DecFloatLower
@@ -1484,6 +1377,50 @@ impl ConversionSpecifier {
         }
     }
 
+    fn format_integer(
+        &self,
+        writer: &mut String,
+        value: Option<IntegerValue>,
+        type_name: &str,
+    ) -> Result<()> {
+        let Some(value) = value else {
+            return if self.conversion_type.supports_integer() {
+                self.format_string(writer, "null")
+            } else {
+                exec_err!(
+                    "Invalid conversion type: {:?} for {}",
+                    self.conversion_type,
+                    type_name
+                )
+            };
+        };
+
+        match self.conversion_type {
+            ConversionType::DecInt => match value.decimal() {
+                IntegerFormatValue::Signed(value) => self.format_signed(writer, value),
+                IntegerFormatValue::Unsigned(value) => {
+                    self.format_unsigned(writer, value)
+                }
+            },
+            ConversionType::HexIntLower
+            | ConversionType::HexIntUpper
+            | ConversionType::OctInt => {
+                self.format_unsigned(writer, value.unsigned_bits())
+            }
+            ConversionType::CharLower | ConversionType::CharUpper => {
+                self.format_char(writer, value.to_char()?)
+            }
+            ConversionType::StringLower | ConversionType::StringUpper => {
+                self.format_string(writer, &value.decimal().to_string())
+            }
+            _ => exec_err!(
+                "Invalid conversion type: {:?} for {}",
+                self.conversion_type,
+                type_name
+            ),
+        }
+    }
+
     fn format_hex_float(&self, writer: &mut String, value: f64) -> Result<()> {
         // Handle special cases first
         let (sign, raw_exponent, mantissa) = value.to_parts();
@@ -2588,6 +2525,62 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_integer_formatting_across_widths() -> Result<()> {
+        let cases = [
+            (ScalarValue::Int8(Some(-1)), "%d|%x|%o|%s", "-1|ff|377|-1"),
+            (
+                ScalarValue::Int16(Some(-1)),
+                "%d|%x|%o|%s",
+                "-1|ffff|177777|-1",
+            ),
+            (
+                ScalarValue::Int32(Some(-1)),
+                "%d|%x|%o|%s",
+                "-1|ffffffff|37777777777|-1",
+            ),
+            (
+                ScalarValue::Int64(Some(-1)),
+                "%d|%x|%o|%s",
+                "-1|ffffffffffffffff|1777777777777777777777|-1",
+            ),
+            (
+                ScalarValue::UInt8(Some(255)),
+                "%d|%x|%o|%s|%c",
+                "255|ff|377|255|ÿ",
+            ),
+            (
+                ScalarValue::UInt16(Some(65535)),
+                "%d|%x|%o|%s",
+                "65535|ffff|177777|65535",
+            ),
+            (
+                ScalarValue::UInt32(Some(u32::MAX)),
+                "%d|%x|%o|%s",
+                "4294967295|ffffffff|37777777777|4294967295",
+            ),
+            (
+                ScalarValue::UInt64(Some(u64::MAX)),
+                "%d|%x|%o|%s",
+                "18446744073709551615|ffffffffffffffff|1777777777777777777777|18446744073709551615",
+            ),
+            (
+                ScalarValue::Int32(None),
+                "%d|%x|%o|%s|%c",
+                "null|null|null|null|null",
+            ),
+        ];
+
+        for (value, fmt, expected) in cases {
+            let data_types =
+                vec![value.data_type(); fmt.chars().filter(|c| *c == '%').count()];
+            let formatter = Formatter::parse(fmt, &data_types)?;
+            let args = vec![value; data_types.len()];
+            assert_eq!(formatter.format(&args)?, expected, "{fmt}");
+        }
+        Ok(())
+    }
+
     #[test]
     fn test_insert_thousands_separator() {
         assert_eq!(insert_thousands_separator("1234567.89"), "1,234,567.89");

From bd07a7c2995eb1128b909ca0ed682ebf58dc169a Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Wed, 20 May 2026 16:41:16 +0800
Subject: [PATCH 2/3] feat: Refactor integer value handling

- Removed IntegerFormatValue.
- Added direct IntegerValue helpers.
- Introduced private From implementations via local macros.
- Shortened integer ScalarValue arms for clarity.
- Deduplicated invalid integer conversion error handling.
- Made test argument counts explicit for better readability.
---
 .../src/function/string/format_string.rs      | 182 +++++++++---------
 1 file changed, 90 insertions(+), 92 deletions(-)

diff --git a/datafusion/spark/src/function/string/format_string.rs b/datafusion/spark/src/function/string/format_string.rs
index 2cec1790e69b3..8af7bc1098338 100644
--- a/datafusion/spark/src/function/string/format_string.rs
+++ b/datafusion/spark/src/function/string/format_string.rs
@@ -898,13 +898,6 @@ enum IntegerValue {
 }
 
 impl IntegerValue {
-    fn decimal(self) -> IntegerFormatValue {
-        match self {
-            Self::Signed { decimal, .. } => IntegerFormatValue::Signed(decimal),
-            Self::Unsigned(value) => IntegerFormatValue::Unsigned(value),
-        }
-    }
-
     fn unsigned_bits(self) -> u64 {
         match self {
             Self::Signed { unsigned_bits, .. } => unsigned_bits,
@@ -918,22 +911,59 @@ impl IntegerValue {
             Self::Unsigned(value) => unsigned_to_char(value),
         }
     }
-}
 
-enum IntegerFormatValue {
-    Signed(i64),
-    Unsigned(u64),
-}
+    fn format_decimal(
+        self,
+        spec: &ConversionSpecifier,
+        writer: &mut String,
+    ) -> Result<()> {
+        match self {
+            Self::Signed { decimal, .. } => spec.format_signed(writer, decimal),
+            Self::Unsigned(value) => spec.format_unsigned(writer, value),
+        }
+    }
 
-impl std::fmt::Display for IntegerFormatValue {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    fn decimal_string(self) -> String {
         match self {
-            Self::Signed(value) => write!(f, "{value}"),
-            Self::Unsigned(value) => write!(f, "{value}"),
+            Self::Signed { decimal, .. } => decimal.to_string(),
+            Self::Unsigned(value) => value.to_string(),
         }
     }
 }
 
+macro_rules! signed_integer_value {
+    ($source:ty, $unsigned:ty) => {
+        impl From<$source> for IntegerValue {
+            fn from(value: $source) -> Self {
+                Self::Signed {
+                    decimal: value as i64,
+                    unsigned_bits: (value as $unsigned) as u64,
+                }
+            }
+        }
+    };
+}
+
+signed_integer_value!(i8, u8);
+signed_integer_value!(i16, u16);
+signed_integer_value!(i32, u32);
+signed_integer_value!(i64, u64);
+
+macro_rules! unsigned_integer_value {
+    ($source:ty) => {
+        impl From<$source> for IntegerValue {
+            fn from(value: $source) -> Self {
+                Self::Unsigned(value as u64)
+            }
+        }
+    };
+}
+
+unsigned_integer_value!(u8);
+unsigned_integer_value!(u16);
+unsigned_integer_value!(u32);
+unsigned_integer_value!(u64);
+
 impl ConversionSpecifier {
     /// Validates that the grouping separator flag is not used with scientific
     /// notation conversions, matching Java/Spark behavior which throws
@@ -966,56 +996,14 @@ impl ConversionSpecifier {
 
                 _ => self.format_boolean(string, value),
             },
-            ScalarValue::Int8(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Signed {
-                    decimal: value as i64,
-                    unsigned_bits: (value as u8) as u64,
-                }),
-                "Int8",
-            ),
-            ScalarValue::Int16(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Signed {
-                    decimal: value as i64,
-                    unsigned_bits: (value as u16) as u64,
-                }),
-                "Int16",
-            ),
-            ScalarValue::Int32(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Signed {
-                    decimal: value as i64,
-                    unsigned_bits: (value as u32) as u64,
-                }),
-                "Int32",
-            ),
-            ScalarValue::Int64(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Signed {
-                    decimal: value,
-                    unsigned_bits: value as u64,
-                }),
-                "Int64",
-            ),
-            ScalarValue::UInt8(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Unsigned(value as u64)),
-                "UInt8",
-            ),
-            ScalarValue::UInt16(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Unsigned(value as u64)),
-                "UInt16",
-            ),
-            ScalarValue::UInt32(value) => self.format_integer(
-                string,
-                value.map(|value| IntegerValue::Unsigned(value as u64)),
-                "UInt32",
-            ),
-            ScalarValue::UInt64(value) => {
-                self.format_integer(string, value.map(IntegerValue::Unsigned), "UInt64")
-            }
+            ScalarValue::Int8(value) => self.format_integer(string, value, "Int8"),
+            ScalarValue::Int16(value) => self.format_integer(string, value, "Int16"),
+            ScalarValue::Int32(value) => self.format_integer(string, value, "Int32"),
+            ScalarValue::Int64(value) => self.format_integer(string, value, "Int64"),
+            ScalarValue::UInt8(value) => self.format_integer(string, value, "UInt8"),
+            ScalarValue::UInt16(value) => self.format_integer(string, value, "UInt16"),
+            ScalarValue::UInt32(value) => self.format_integer(string, value, "UInt32"),
+            ScalarValue::UInt64(value) => self.format_integer(string, value, "UInt64"),
             ScalarValue::Float16(value) => match (self.conversion_type, value) {
                 (
                     ConversionType::DecFloatLower
@@ -1377,31 +1365,25 @@ impl ConversionSpecifier {
         }
     }
 
-    fn format_integer(
+    fn format_integer<T>(
         &self,
         writer: &mut String,
-        value: Option<IntegerValue>,
+        value: &Option<T>,
         type_name: &str,
-    ) -> Result<()> {
-        let Some(value) = value else {
+    ) -> Result<()>
+    where
+        T: Copy + Into<IntegerValue>,
+    {
+        let Some(value) = value.map(Into::into) else {
             return if self.conversion_type.supports_integer() {
                 self.format_string(writer, "null")
             } else {
-                exec_err!(
-                    "Invalid conversion type: {:?} for {}",
-                    self.conversion_type,
-                    type_name
-                )
+                self.invalid_integer_conversion(type_name)
             };
         };
 
         match self.conversion_type {
-            ConversionType::DecInt => match value.decimal() {
-                IntegerFormatValue::Signed(value) => self.format_signed(writer, value),
-                IntegerFormatValue::Unsigned(value) => {
-                    self.format_unsigned(writer, value)
-                }
-            },
+            ConversionType::DecInt => value.format_decimal(self, writer),
             ConversionType::HexIntLower
             | ConversionType::HexIntUpper
             | ConversionType::OctInt => {
@@ -1411,16 +1393,20 @@ impl ConversionSpecifier {
                 self.format_char(writer, value.to_char()?)
             }
             ConversionType::StringLower | ConversionType::StringUpper => {
-                self.format_string(writer, &value.decimal().to_string())
+                self.format_string(writer, &value.decimal_string())
             }
-            _ => exec_err!(
-                "Invalid conversion type: {:?} for {}",
-                self.conversion_type,
-                type_name
-            ),
+            _ => self.invalid_integer_conversion(type_name),
         }
     }
 
+    fn invalid_integer_conversion<T>(&self, type_name: &str) -> Result<T> {
+        exec_err!(
+            "Invalid conversion type: {:?} for {}",
+            self.conversion_type,
+            type_name
+        )
+    }
+
     fn format_hex_float(&self, writer: &mut String, value: f64) -> Result<()> {
         // Handle special cases first
         let (sign, raw_exponent, mantissa) = value.to_parts();
@@ -2528,54 +2514,66 @@ mod tests {
     #[test]
     fn test_integer_formatting_across_widths() -> Result<()> {
         let cases = [
-            (ScalarValue::Int8(Some(-1)), "%d|%x|%o|%s", "-1|ff|377|-1"),
+            (
+                ScalarValue::Int8(Some(-1)),
+                "%d|%x|%o|%s",
+                4,
+                "-1|ff|377|-1",
+            ),
             (
                 ScalarValue::Int16(Some(-1)),
                 "%d|%x|%o|%s",
+                4,
                 "-1|ffff|177777|-1",
             ),
             (
                 ScalarValue::Int32(Some(-1)),
                 "%d|%x|%o|%s",
+                4,
                 "-1|ffffffff|37777777777|-1",
             ),
             (
                 ScalarValue::Int64(Some(-1)),
                 "%d|%x|%o|%s",
+                4,
                 "-1|ffffffffffffffff|1777777777777777777777|-1",
             ),
             (
                 ScalarValue::UInt8(Some(255)),
                 "%d|%x|%o|%s|%c",
+                5,
                 "255|ff|377|255|ÿ",
             ),
             (
                 ScalarValue::UInt16(Some(65535)),
                 "%d|%x|%o|%s",
+                4,
                 "65535|ffff|177777|65535",
             ),
             (
                 ScalarValue::UInt32(Some(u32::MAX)),
                 "%d|%x|%o|%s",
+                4,
                 "4294967295|ffffffff|37777777777|4294967295",
             ),
             (
                 ScalarValue::UInt64(Some(u64::MAX)),
                 "%d|%x|%o|%s",
+                4,
                 "18446744073709551615|ffffffffffffffff|1777777777777777777777|18446744073709551615",
             ),
             (
                 ScalarValue::Int32(None),
                 "%d|%x|%o|%s|%c",
+                5,
                 "null|null|null|null|null",
             ),
         ];
 
-        for (value, fmt, expected) in cases {
-            let data_types =
-                vec![value.data_type(); fmt.chars().filter(|c| *c == '%').count()];
+        for (value, fmt, arg_count, expected) in cases {
+            let data_types = vec![value.data_type(); arg_count];
             let formatter = Formatter::parse(fmt, &data_types)?;
-            let args = vec![value; data_types.len()];
+            let args = vec![value; arg_count];
             assert_eq!(formatter.format(&args)?, expected, "{fmt}");
         }
         Ok(())

From 08209643134b6f510d372916bc15b717aa24a55d Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Wed, 20 May 2026 17:16:33 +0800
Subject: [PATCH 3/3] docs: add IntegerValue invariant doc comment in
 format_string.rs

---
 datafusion/spark/src/function/string/format_string.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/datafusion/spark/src/function/string/format_string.rs b/datafusion/spark/src/function/string/format_string.rs
index 8af7bc1098338..30bc2eacf7a76 100644
--- a/datafusion/spark/src/function/string/format_string.rs
+++ b/datafusion/spark/src/function/string/format_string.rs
@@ -891,6 +891,9 @@ fn unsigned_to_char(value: u64) -> Result<char> {
     codepoint_to_char(codepoint)
 }
 
+/// Normalizes integer scalar payloads while preserving Spark formatting semantics:
+/// signed values format as decimal for `%d` / `%s` / `%c`, but use their original
+/// bit width for `%x` / `%o` via `unsigned_bits`.
 #[derive(Debug, Clone, Copy)]
 enum IntegerValue {
     Signed { decimal: i64, unsigned_bits: u64 },