From a7f4feb484e80c74a6c92ad01c0f661642825b1b Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 29 May 2026 15:39:57 +0200 Subject: [PATCH 1/4] [SPARK-57032][SQL] Extend timestamp string parsing for nanosecond fractional precision ### What changes were proposed in this pull request? Extend `SparkDateTimeUtils.parseTimestampString` to preserve fractional-second digits 7-9 in a new output-only slot `segments(9)` (sub-microsecond remainder in [0, 999]), while keeping `segments(6)` as microseconds so all existing callers are unaffected. Add internal parse entry points (declared without an access modifier on the `SparkDateTimeUtils` trait) that return a normalized `TimestampNanosVal` for `TIMESTAMP_NTZ(p)`/`TIMESTAMP_LTZ(p)` with `p` in [7, 9]: `stringToTimestampNTZNanos`, `stringToTimestampLTZNanos`, and their ANSI variants. Fractional digits beyond the target precision `p` are truncated toward zero, consistent with the existing microsecond parsing behavior. ### Why are the changes needed? This is the first sub-task of the nanosecond datetime conversion utilities under SPARK-56822 (SPIP: Timestamps with nanosecond precision). Without it, timestamp strings with 7-9 fractional digits cannot be converted to the nanosecond-capable composite representation (epochMicros + nanosWithinMicro). ### Does this PR introduce any user-facing change? No. Existing `TimestampType`/`TimestampNTZType` string parsing is unchanged; the new parse APIs are internal utilities (declared without an access modifier in this patch) and not yet wired to user-facing casts. ### How was this patch tested? Added `TimestampNanosParseSuite` covering 7/8/9-digit fractions, per-precision truncation, NTZ/LTZ, zone suffixes, range edge cases, and ANSI errors. Verified existing `DateTimeUtilsSuite` and `TimestampFormatterSuite` still pass. 
--- .../catalyst/util/SparkDateTimeUtils.scala | 138 +++++++++++++- .../util/TimestampNanosParseSuite.scala | 173 ++++++++++++++++++ 2 files changed, 305 insertions(+), 6 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index 9684737a22865..4f04e827ab522 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -30,8 +30,8 @@ import org.apache.spark.QueryContext import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.RebaseDateTime.{rebaseGregorianToJulianDays, rebaseGregorianToJulianMicros, rebaseJulianToGregorianDays, rebaseJulianToGregorianMicros} import org.apache.spark.sql.errors.ExecutionErrors -import org.apache.spark.sql.types.{DateType, TimestampType, TimeType} -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.sql.types.{DateType, TimestampLTZNanosType, TimestampNTZNanosType, TimestampType, TimeType} +import org.apache.spark.unsafe.types.{TimestampNanosVal, UTF8String} import org.apache.spark.util.SparkClassUtils trait SparkDateTimeUtils { @@ -491,6 +491,11 @@ trait SparkDateTimeUtils { * - +|-hhmmss * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` * + * Up to 9 fractional-second digits are accepted. Digits 1-6 are kept as microseconds in + * `segments(6)` (backward-compatible micro behavior), digits 7-9 are kept as the + * sub-microsecond remainder in `segments(9)` (a value in [0, 999]), and digits beyond the 9th + * are dropped. + * * @return * timestamp segments, time zone id and whether the input is just time without a date. 
If the * input string can't be parsed as timestamp, the result timestamp segments are empty. @@ -509,7 +514,11 @@ trait SparkDateTimeUtils { return (Array.empty, None, false) } var tz: Option[String] = None - val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0) + // Indices 0-6 hold year, month, day, hour, minute, second and the microsecond part of the + // fractional second (digits 1-6). Index 9 is an output-only slot that holds the + // sub-microsecond remainder (fractional digits 7-9) as a value in [0, 999]; it is not touched + // by the parsing loop below. Indices 7-8 are used while validating a region-based zone id. + val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0, 0) var i = 0 var currentSegmentValue = 0 var currentSegmentDigits = 0 @@ -522,6 +531,7 @@ trait SparkDateTimeUtils { } var digitsMilli = 0 + var nanosWithinMicro = 0 var justTime = false var yearSign: Option[Int] = None if (bytes(j) == '-' || bytes(j) == '+') { @@ -604,7 +614,9 @@ trait SparkDateTimeUtils { i += 1 } } else { - if (i < segments.length && (b == ':' || b == ' ')) { + // Bound is fixed at 9 (the original number of parsed segments) so that the trailing + // output-only slot segments(9) is never written by the parsing loop. + if (i < 9 && (b == ':' || b == ' ')) { if (!isValidDigits(i, currentSegmentDigits)) { return (Array.empty, None, false) } @@ -620,10 +632,13 @@ trait SparkDateTimeUtils { if (i == 6) { digitsMilli += 1 } - // We will truncate the nanosecond part if there are more than 6 digits, which results - // in loss of precision if (i != 6 || currentSegmentDigits < 6) { + // Fractional digits 1-6 form the microsecond part stored in segments(6). currentSegmentValue = currentSegmentValue * 10 + parsedValue + } else if (currentSegmentDigits < 9) { + // Fractional digits 7-9 are retained as the sub-microsecond remainder. Digits beyond + // the 9th are dropped (loss of precision below the nanosecond grid). 
+ nanosWithinMicro = nanosWithinMicro * 10 + parsedValue } currentSegmentDigits += 1 } @@ -640,6 +655,17 @@ trait SparkDateTimeUtils { digitsMilli += 1 } + // Right-pad the captured sub-microsecond digits (the 7th to 9th fractional digits) so that + // segments(9) always holds a value in [0, 999]. The number of captured digits is + // clamp(digitsMilli - 6, 0, 3); fewer captured digits means the remainder is left-aligned and + // must be scaled up (e.g. ".0000001" -> 100, ".00000012" -> 120, ".000000123" -> 123). + var subMicroDigits = math.max(0, math.min(digitsMilli, 9) - 6) + while (subMicroDigits < 3) { + nanosWithinMicro *= 10 + subMicroDigits += 1 + } + segments(9) = nanosWithinMicro + // This step also validates time zone part val zoneId = tz.map(zoneName => getZoneId(zoneName.trim)) segments(0) *= yearSign.getOrElse(1) @@ -713,6 +739,106 @@ trait SparkDateTimeUtils { } } + /** + * Truncates the sub-microsecond remainder (`segments(9)`, a value in [0, 999]) to the given + * fractional-second `precision`. Since microseconds occupy fractional digits 1-6, a `precision` + * in [7, 9] only affects the sub-microsecond digits: digits beyond `precision` are dropped + * (truncation toward zero, consistent with the microsecond parsing path). + */ + private def truncateNanosWithinMicro(nanosWithinMicro: Int, precision: Int): Short = { + val factor = precision match { + case 7 => 100 + case 8 => 10 + case _ => 1 + } + ((nanosWithinMicro / factor) * factor).toShort + } + + /** + * Trims and parses a given UTF8 string into a [[TimestampNanosVal]] (epoch microseconds plus a + * sub-microsecond remainder in [0, 999]) for `TIMESTAMP_LTZ(precision)` with `precision` in + * [7, 9]. Fractional digits beyond `precision` are truncated. The return type is [[Option]] in + * order to distinguish between a valid zero value and null. Please refer to + * `parseTimestampString` for the allowed formats. 
+ */ + def stringToTimestampLTZNanos( + s: UTF8String, + precision: Int, + timeZoneId: ZoneId): Option[TimestampNanosVal] = { + try { + val (segments, parsedZoneId, justTime) = parseTimestampString(s) + if (segments.isEmpty) { + return None + } + val zoneId = parsedZoneId.getOrElse(timeZoneId) + val nanoseconds = MICROSECONDS.toNanos(segments(6)) + val localTime = LocalTime.of(segments(3), segments(4), segments(5), nanoseconds.toInt) + val localDate = if (justTime) { + LocalDate.now(zoneId) + } else { + LocalDate.of(segments(0), segments(1), segments(2)) + } + val localDateTime = LocalDateTime.of(localDate, localTime) + val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) + val instant = Instant.from(zonedDateTime) + val epochMicros = instantToMicros(instant) + Some(TimestampNanosVal.fromParts( + epochMicros, truncateNanosWithinMicro(segments(9), precision))) + } catch { + case NonFatal(_) => None + } + } + + def stringToTimestampLTZNanosAnsi( + s: UTF8String, + precision: Int, + timeZoneId: ZoneId, + context: QueryContext = null): TimestampNanosVal = { + stringToTimestampLTZNanos(s, precision, timeZoneId).getOrElse { + throw ExecutionErrors.invalidInputInCastToDatetimeError( + s, TimestampLTZNanosType(precision), context) + } + } + + /** + * Trims and parses a given UTF8 string into a [[TimestampNanosVal]] (epoch microseconds plus a + * sub-microsecond remainder in [0, 999]) for `TIMESTAMP_NTZ(precision)` with `precision` in + * [7, 9]. Fractional digits beyond `precision` are truncated. The result is independent of time + * zones; a time zone component is discarded when `allowTimeZone` is `true` and rejected (returns + * `None`) otherwise. The return type is [[Option]] in order to distinguish between a valid zero + * value and null. Please refer to `parseTimestampString` for the allowed formats. 
+ */ + def stringToTimestampNTZNanos( + s: UTF8String, + precision: Int, + allowTimeZone: Boolean = true): Option[TimestampNanosVal] = { + try { + val (segments, zoneIdOpt, justTime) = parseTimestampString(s) + if (segments.isEmpty || justTime || !allowTimeZone && zoneIdOpt.isDefined) { + return None + } + val nanoseconds = MICROSECONDS.toNanos(segments(6)) + val localTime = LocalTime.of(segments(3), segments(4), segments(5), nanoseconds.toInt) + val localDate = LocalDate.of(segments(0), segments(1), segments(2)) + val localDateTime = LocalDateTime.of(localDate, localTime) + val epochMicros = localDateTimeToMicros(localDateTime) + Some(TimestampNanosVal.fromParts( + epochMicros, truncateNanosWithinMicro(segments(9), precision))) + } catch { + case NonFatal(_) => None + } + } + + def stringToTimestampNTZNanosAnsi( + s: UTF8String, + precision: Int, + context: QueryContext = null): TimestampNanosVal = { + stringToTimestampNTZNanos(s, precision).getOrElse { + throw ExecutionErrors.invalidInputInCastToDatetimeError( + s, TimestampNTZNanosType(precision), context) + } + } + /** * Trims and parses a given UTF8 string to a corresponding [[Long]] value which representing the * number of microseconds since the midnight. The result will be independent of time zones. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala new file mode 100644 index 0000000000000..bba3ff576a5fa --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import java.time.{ZoneId, ZoneOffset} + +import org.apache.spark.{SparkDateTimeException, SparkFunSuite} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ +import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.unsafe.types.{TimestampNanosVal, UTF8String} + +/** + * Tests for string-to-nanosecond timestamp parsing added under SPARK-57032. The parser keeps the + * microsecond part (fractional digits 1-6) and the sub-microsecond remainder (digits 7-9, in + * [0, 999]) and applies the target fractional precision `p` in [7, 9] by truncating extra digits. 
+ */ +class TimestampNanosParseSuite extends SparkFunSuite { + + private val losAngeles = getZoneId("America/Los_Angeles") + + private def ntz( + str: String, + precision: Int, + allowTimeZone: Boolean = true): Option[TimestampNanosVal] = { + stringToTimestampNTZNanos(UTF8String.fromString(str), precision, allowTimeZone) + } + + private def ltz(str: String, precision: Int, zoneId: ZoneId): Option[TimestampNanosVal] = { + stringToTimestampLTZNanos(UTF8String.fromString(str), precision, zoneId) + } + + test("NTZ: fractional digits 7-9 are preserved as nanosWithinMicro") { + assert(ntz("2015-01-02 00:00:00.123456789", 9).get === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 123456, ZoneOffset.UTC), 789.toShort)) + assert(ntz("2015-01-02 00:00:00.1234567", 9).get === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 123456, ZoneOffset.UTC), 700.toShort)) + assert(ntz("2015-01-02 00:00:00.12345678", 9).get === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 123456, ZoneOffset.UTC), 780.toShort)) + } + + test("NTZ: precision truncates excess sub-microsecond digits toward zero") { + val micros = date(2020, 12, 31, 23, 59, 59, 123456, ZoneOffset.UTC) + assert(ntz("2020-12-31 23:59:59.123456789", 9).get === + TimestampNanosVal.fromParts(micros, 789.toShort)) + assert(ntz("2020-12-31 23:59:59.123456789", 8).get === + TimestampNanosVal.fromParts(micros, 780.toShort)) + assert(ntz("2020-12-31 23:59:59.123456789", 7).get === + TimestampNanosVal.fromParts(micros, 700.toShort)) + } + + test("NTZ: digits beyond the 9th are dropped") { + val expected = TimestampNanosVal.fromParts( + date(2020, 12, 31, 23, 59, 59, 123456, ZoneOffset.UTC), 789.toShort) + assert(ntz("2020-12-31 23:59:59.1234567890", 9).get === expected) + assert(ntz("2020-12-31 23:59:59.123456789999", 9).get === expected) + } + + test("NTZ: fewer than 6 fractional digits yield zero nanosWithinMicro") { + assert(ntz("2020-01-01 00:00:00.0", 9).get === + TimestampNanosVal.fromParts(date(2020, 1, 
1, 0, 0, 0, 0, ZoneOffset.UTC), 0.toShort)) + assert(ntz("2020-01-01 00:00:00.1", 9).get === + TimestampNanosVal.fromParts(date(2020, 1, 1, 0, 0, 0, 100000, ZoneOffset.UTC), 0.toShort)) + assert(ntz("2020-01-01 00:00:00.123456", 9).get === + TimestampNanosVal.fromParts(date(2020, 1, 1, 0, 0, 0, 123456, ZoneOffset.UTC), 0.toShort)) + } + + test("NTZ: trailing zeros in the sub-microsecond part") { + assert(ntz("2015-01-02 00:00:00.000050000", 9).get === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 50, ZoneOffset.UTC), 0.toShort)) + assert(ntz("2015-01-02 00:00:00.100000009", 9).get === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 100000, ZoneOffset.UTC), 9.toShort)) + } + + test("NTZ: maximum and minimum sub-microsecond fractions") { + assert(ntz("2020-06-15 12:00:00.999999999", 9).get === + TimestampNanosVal.fromParts(date(2020, 6, 15, 12, 0, 0, 999999, ZoneOffset.UTC), 999.toShort)) + assert(ntz("2020-06-15 12:00:00.000000001", 9).get === + TimestampNanosVal.fromParts(date(2020, 6, 15, 12, 0, 0, 0, ZoneOffset.UTC), 1.toShort)) + // ".000000001" loses its only sub-micro digit at precision 8 and 7. + assert(ntz("2020-06-15 12:00:00.000000001", 8).get.nanosWithinMicro === 0.toShort) + assert(ntz("2020-06-15 12:00:00.000000001", 7).get.nanosWithinMicro === 0.toShort) + } + + test("NTZ: time zone component is discarded or rejected based on allowTimeZone") { + // With allowTimeZone = true (default) the zone suffix is discarded. + assert(ntz("2015-03-18T12:03:17.123456789Z", 9).get === + TimestampNanosVal.fromParts( + date(2015, 3, 18, 12, 3, 17, 123456, ZoneOffset.UTC), 789.toShort)) + // With allowTimeZone = false a zone suffix makes the input invalid. + assert(ntz("2015-03-18T12:03:17.123456789Z", 9, allowTimeZone = false).isEmpty) + // A time-only input cannot be parsed as TIMESTAMP_NTZ. 
+ assert(ntz("12:03:17.123456789", 9).isEmpty) + } + + test("LTZ: explicit zone offset in the string") { + val expected = TimestampNanosVal.fromParts( + date(2015, 3, 18, 12, 3, 17, 123456, getZoneId("+07:00")), 789.toShort) + assert(ltz("2015-03-18T12:03:17.123456789+07:00", 9, ZoneOffset.UTC).get === expected) + } + + test("LTZ: region-based zone in the string") { + val expected = TimestampNanosVal.fromParts( + date(2015, 3, 18, 12, 3, 17, 123456, getZoneId("Europe/Moscow")), 789.toShort) + assert(ltz("2015-03-18T12:03:17.123456789 Europe/Moscow", 9, ZoneOffset.UTC).get === expected) + } + + test("LTZ: falls back to the session zone when the string has no zone") { + val expected = TimestampNanosVal.fromParts( + date(2015, 3, 18, 12, 3, 17, 123456, losAngeles), 789.toShort) + assert(ltz("2015-03-18 12:03:17.123456789", 9, losAngeles).get === expected) + } + + test("LTZ: precision truncation matches the NTZ path") { + val micros = date(2015, 3, 18, 12, 3, 17, 123456, ZoneOffset.UTC) + assert(ltz("2015-03-18T12:03:17.123456789Z", 7, ZoneOffset.UTC).get === + TimestampNanosVal.fromParts(micros, 700.toShort)) + assert(ltz("2015-03-18T12:03:17.123456789Z", 8, ZoneOffset.UTC).get === + TimestampNanosVal.fromParts(micros, 780.toShort)) + } + + test("range edge cases with sub-microsecond fractions") { + // Unix epoch. + assert(ntz("1970-01-01 00:00:00.000000001", 9).get === + TimestampNanosVal.fromParts(0L, 1.toShort)) + // Julian/Gregorian cutover. + assert(ntz("1582-10-15 00:00:00.123456789", 9).get === + TimestampNanosVal.fromParts(date(1582, 10, 15, 0, 0, 0, 123456, ZoneOffset.UTC), 789.toShort)) + // End of the supported range. 
+ assert(ntz("9999-12-31 23:59:59.999999999", 9).get === + TimestampNanosVal.fromParts( + date(9999, 12, 31, 23, 59, 59, 999999, ZoneOffset.UTC), 999.toShort)) + } + + test("invalid inputs return None") { + assert(ntz("not a timestamp", 9).isEmpty) + assert(ntz("", 9).isEmpty) + assert(ltz("2015-13-40 99:99:99.123456789", 9, ZoneOffset.UTC).isEmpty) + } + + test("ANSI variants throw on invalid input") { + val ntzValid = stringToTimestampNTZNanosAnsi( + UTF8String.fromString("2015-01-02 00:00:00.123456789"), 9) + assert(ntzValid === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 123456, ZoneOffset.UTC), 789.toShort)) + + val ltzValid = stringToTimestampLTZNanosAnsi( + UTF8String.fromString("2015-01-02 00:00:00.123456789Z"), 9, ZoneOffset.UTC) + assert(ltzValid === + TimestampNanosVal.fromParts(date(2015, 1, 2, 0, 0, 0, 123456, ZoneOffset.UTC), 789.toShort)) + + intercept[SparkDateTimeException] { + stringToTimestampNTZNanosAnsi(UTF8String.fromString("invalid"), 9) + } + intercept[SparkDateTimeException] { + stringToTimestampLTZNanosAnsi(UTF8String.fromString("invalid"), 9, ZoneOffset.UTC) + } + } +} From 95f7e9edce822cd336e2fbe895a40cd24ac74944 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 29 May 2026 15:42:37 +0200 Subject: [PATCH 2/4] Fix coding style --- .../catalyst/util/SparkDateTimeUtils.scala | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index 4f04e827ab522..0bab4c75184d2 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -756,8 +756,8 @@ trait SparkDateTimeUtils { /** * Trims and parses a given UTF8 string into a [[TimestampNanosVal]] (epoch microseconds plus a - * sub-microsecond remainder in [0, 999]) 
for `TIMESTAMP_LTZ(precision)` with `precision` in - * [7, 9]. Fractional digits beyond `precision` are truncated. The return type is [[Option]] in + * sub-microsecond remainder in [0, 999]) for `TIMESTAMP_LTZ(precision)` with `precision` in [7, + * 9]. Fractional digits beyond `precision` are truncated. The return type is [[Option]] in * order to distinguish between a valid zero value and null. Please refer to * `parseTimestampString` for the allowed formats. */ @@ -782,8 +782,9 @@ trait SparkDateTimeUtils { val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) val instant = Instant.from(zonedDateTime) val epochMicros = instantToMicros(instant) - Some(TimestampNanosVal.fromParts( - epochMicros, truncateNanosWithinMicro(segments(9), precision))) + Some( + TimestampNanosVal + .fromParts(epochMicros, truncateNanosWithinMicro(segments(9), precision))) } catch { case NonFatal(_) => None } @@ -796,17 +797,19 @@ trait SparkDateTimeUtils { context: QueryContext = null): TimestampNanosVal = { stringToTimestampLTZNanos(s, precision, timeZoneId).getOrElse { throw ExecutionErrors.invalidInputInCastToDatetimeError( - s, TimestampLTZNanosType(precision), context) + s, + TimestampLTZNanosType(precision), + context) } } /** * Trims and parses a given UTF8 string into a [[TimestampNanosVal]] (epoch microseconds plus a - * sub-microsecond remainder in [0, 999]) for `TIMESTAMP_NTZ(precision)` with `precision` in - * [7, 9]. Fractional digits beyond `precision` are truncated. The result is independent of time - * zones; a time zone component is discarded when `allowTimeZone` is `true` and rejected (returns - * `None`) otherwise. The return type is [[Option]] in order to distinguish between a valid zero - * value and null. Please refer to `parseTimestampString` for the allowed formats. + * sub-microsecond remainder in [0, 999]) for `TIMESTAMP_NTZ(precision)` with `precision` in [7, + * 9]. Fractional digits beyond `precision` are truncated. 
The result is independent of time + * zones; a time zone component is discarded when `allowTimeZone` is `true` and rejected + * (returns `None`) otherwise. The return type is [[Option]] in order to distinguish between a + * valid zero value and null. Please refer to `parseTimestampString` for the allowed formats. */ def stringToTimestampNTZNanos( s: UTF8String, @@ -822,8 +825,9 @@ trait SparkDateTimeUtils { val localDate = LocalDate.of(segments(0), segments(1), segments(2)) val localDateTime = LocalDateTime.of(localDate, localTime) val epochMicros = localDateTimeToMicros(localDateTime) - Some(TimestampNanosVal.fromParts( - epochMicros, truncateNanosWithinMicro(segments(9), precision))) + Some( + TimestampNanosVal + .fromParts(epochMicros, truncateNanosWithinMicro(segments(9), precision))) } catch { case NonFatal(_) => None } @@ -835,7 +839,9 @@ trait SparkDateTimeUtils { context: QueryContext = null): TimestampNanosVal = { stringToTimestampNTZNanos(s, precision).getOrElse { throw ExecutionErrors.invalidInputInCastToDatetimeError( - s, TimestampNTZNanosType(precision), context) + s, + TimestampNTZNanosType(precision), + context) } } From 73bd3befcb9d179406022126262fce9597ede915 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 29 May 2026 16:16:03 +0200 Subject: [PATCH 3/4] Address review findings from SPARK-57032 nanos string parsing - Fix stale `isValidDigits` comment (digits 7-9 are now retained, not truncated) - Clarify segments(7-8) comment: values are written by loop as `i` advances but never read by any caller - Extend format-string examples in `parseTimestampString` Scaladoc to show the optional [ns][ns][ns] digits - Add precision guard (throws SparkException.internalError) before the try/catch in stringToTimestampLTZNanos and stringToTimestampNTZNanos, and explicit case 9 + error fallback in truncateNanosWithinMicro - Add Scaladoc to stringToTimestampNTZNanosAnsi noting that allowTimeZone defaults to true (TZ suffix is discarded, not rejected) - New tests: 
null input, time-only LTZ, pre-epoch negative timestamps, out-of-range precision (checkError / INTERNAL_ERROR), ANSI NTZ TZ-discard Co-authored-by: Isaac --- .../catalyst/util/SparkDateTimeUtils.scala | 35 +++++++++---- .../util/TimestampNanosParseSuite.scala | 50 ++++++++++++++++++- 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index 0bab4c75184d2..b5961268b7e04 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -26,7 +26,7 @@ import java.util.regex.Pattern import scala.util.control.NonFatal -import org.apache.spark.QueryContext +import org.apache.spark.{QueryContext, SparkException} import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.RebaseDateTime.{rebaseGregorianToJulianDays, rebaseGregorianToJulianMicros, rebaseJulianToGregorianDays, rebaseJulianToGregorianMicros} import org.apache.spark.sql.errors.ExecutionErrors @@ -474,10 +474,10 @@ trait SparkDateTimeUtils { * order to distinguish between 0L and null. 
The following formats are allowed: * * `[+-]yyyy*` `[+-]yyyy*-[m]m` `[+-]yyyy*-[m]m-[d]d` `[+-]yyyy*-[m]m-[d]d ` - * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` - * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` - * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` - * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][ns][ns][ns][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][ns][ns][ns][zone_id]` + * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][ns][ns][ns][zone_id]` + * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][ns][ns][ns][zone_id]` * * where `zone_id` should have one of the forms: * - Z - Zulu time zone UTC+0 @@ -504,7 +504,8 @@ trait SparkDateTimeUtils { def isValidDigits(segment: Int, digits: Int): Boolean = { // A Long is able to represent a timestamp within [+-]200 thousand years val maxDigitsYear = 6 - // For the nanosecond part, more than 6 digits is allowed, but will be truncated. + // Fractional digits 1-6 form microseconds; digits 7-9 are retained as the sub-microsecond + // remainder in segments(9); only digits beyond the 9th are dropped. segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID (segment == 7 && digits <= 2) || @@ -516,8 +517,9 @@ trait SparkDateTimeUtils { var tz: Option[String] = None // Indices 0-6 hold year, month, day, hour, minute, second and the microsecond part of the // fractional second (digits 1-6). Index 9 is an output-only slot that holds the - // sub-microsecond remainder (fractional digits 7-9) as a value in [0, 999]; it is not touched - // by the parsing loop below. Indices 7-8 are used while validating a region-based zone id. + // sub-microsecond remainder (fractional digits 7-9) as a value in [0, 999]; it is never + // written by the parsing loop below. 
Indices 7-8 are written by the loop as `i` advances + // but their values are never read by any caller. val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0, 0) var i = 0 var currentSegmentValue = 0 @@ -749,7 +751,10 @@ trait SparkDateTimeUtils { val factor = precision match { case 7 => 100 case 8 => 10 - case _ => 1 + case 9 => 1 + case _ => + throw SparkException.internalError( + s"truncateNanosWithinMicro called with precision $precision outside [7, 9]") } ((nanosWithinMicro / factor) * factor).toShort } @@ -765,6 +770,9 @@ trait SparkDateTimeUtils { s: UTF8String, precision: Int, timeZoneId: ZoneId): Option[TimestampNanosVal] = { + if (precision < 7 || precision > 9) + throw SparkException.internalError( + s"stringToTimestampLTZNanos: precision $precision is out of range [7, 9]") try { val (segments, parsedZoneId, justTime) = parseTimestampString(s) if (segments.isEmpty) { @@ -815,6 +823,9 @@ trait SparkDateTimeUtils { s: UTF8String, precision: Int, allowTimeZone: Boolean = true): Option[TimestampNanosVal] = { + if (precision < 7 || precision > 9) + throw SparkException.internalError( + s"stringToTimestampNTZNanos: precision $precision is out of range [7, 9]") try { val (segments, zoneIdOpt, justTime) = parseTimestampString(s) if (segments.isEmpty || justTime || !allowTimeZone && zoneIdOpt.isDefined) { @@ -833,6 +844,12 @@ trait SparkDateTimeUtils { } } + /** + * ANSI variant of [[stringToTimestampNTZNanos]]. Throws [[org.apache.spark.SparkDateTimeException]] + * on invalid input. Uses `allowTimeZone = true`: a time zone component in the string is silently + * discarded rather than rejected. Callers that need strict NTZ rejection should call + * [[stringToTimestampNTZNanos]] directly with `allowTimeZone = false`. 
+ */ def stringToTimestampNTZNanosAnsi( s: UTF8String, precision: Int, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala index bba3ff576a5fa..4b31e38f0f905 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.time.{ZoneId, ZoneOffset} -import org.apache.spark.{SparkDateTimeException, SparkFunSuite} +import org.apache.spark.{SparkDateTimeException, SparkException, SparkFunSuite} import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.unsafe.types.{TimestampNanosVal, UTF8String} @@ -146,12 +146,60 @@ class TimestampNanosParseSuite extends SparkFunSuite { date(9999, 12, 31, 23, 59, 59, 999999, ZoneOffset.UTC), 999.toShort)) } + test("null input returns None") { + assert(stringToTimestampNTZNanos(null, 9).isEmpty) + assert(stringToTimestampLTZNanos(null, 9, ZoneOffset.UTC).isEmpty) + } + test("invalid inputs return None") { assert(ntz("not a timestamp", 9).isEmpty) assert(ntz("", 9).isEmpty) assert(ltz("2015-13-40 99:99:99.123456789", 9, ZoneOffset.UTC).isEmpty) } + test("LTZ: time-only input uses the session zone's current date") { + // Time-only strings are accepted by the LTZ path (date is filled with LocalDate.now); + // they are rejected by the NTZ path because the date is indeterminate. 
+ val result = ltz("12:03:17.123456789", 9, ZoneOffset.UTC) + assert(result.isDefined) + assert(result.get.nanosWithinMicro === 789.toShort) + assert(ntz("12:03:17.123456789", 9).isEmpty) + } + + test("pre-epoch (negative) timestamps with sub-microsecond fractions") { + // Exercises the yearSign path together with segments(9). + assert(ntz("-0001-01-01 00:00:00.000000001", 9).get === + TimestampNanosVal.fromParts( + date(-1, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC), 1.toShort)) + assert(ntz("1582-10-14 23:59:59.999999999", 9).get === + TimestampNanosVal.fromParts( + date(1582, 10, 14, 23, 59, 59, 999999, ZoneOffset.UTC), 999.toShort)) + } + + test("truncateNanosWithinMicro throws internalError for out-of-range precision") { + // Precision must be in [7, 9]; anything outside is a caller bug and should surface loudly. + Seq(0, 6, 10, -1).foreach { p => + checkError( + exception = intercept[SparkException] { + stringToTimestampNTZNanos( + UTF8String.fromString("2020-01-01 00:00:00.123456789"), p) + }, + condition = "INTERNAL_ERROR", + parameters = Map( + "message" -> s"stringToTimestampNTZNanos: precision $p is out of range [7, 9]")) + } + } + + test("ANSI NTZ: time zone component in the string is silently discarded") { + // allowTimeZone defaults to true in the ANSI variant: the zone suffix is dropped, not + // rejected. Callers that need strict rejection must use stringToTimestampNTZNanos directly + // with allowTimeZone = false. 
+ val result = stringToTimestampNTZNanosAnsi( + UTF8String.fromString("2015-03-18T12:03:17.123456789Z"), 9) + assert(result === + TimestampNanosVal.fromParts(date(2015, 3, 18, 12, 3, 17, 123456, ZoneOffset.UTC), 789.toShort)) + } + test("ANSI variants throw on invalid input") { val ntzValid = stringToTimestampNTZNanosAnsi( UTF8String.fromString("2015-01-02 00:00:00.123456789"), 9) From a3aafc154ea46d91c05850d006aec354a61c4e08 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 29 May 2026 20:21:07 +0200 Subject: [PATCH 4/4] Fix scalastyle violations in nanos string parsing code Co-authored-by: Isaac --- .../sql/catalyst/util/SparkDateTimeUtils.scala | 15 +++++++++------ .../catalyst/util/TimestampNanosParseSuite.scala | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index b5961268b7e04..09180b1dc97b5 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -770,9 +770,10 @@ trait SparkDateTimeUtils { s: UTF8String, precision: Int, timeZoneId: ZoneId): Option[TimestampNanosVal] = { - if (precision < 7 || precision > 9) + if (precision < 7 || precision > 9) { throw SparkException.internalError( s"stringToTimestampLTZNanos: precision $precision is out of range [7, 9]") + } try { val (segments, parsedZoneId, justTime) = parseTimestampString(s) if (segments.isEmpty) { @@ -823,9 +824,10 @@ trait SparkDateTimeUtils { s: UTF8String, precision: Int, allowTimeZone: Boolean = true): Option[TimestampNanosVal] = { - if (precision < 7 || precision > 9) + if (precision < 7 || precision > 9) { throw SparkException.internalError( s"stringToTimestampNTZNanos: precision $precision is out of range [7, 9]") + } try { val (segments, zoneIdOpt, justTime) = 
parseTimestampString(s) if (segments.isEmpty || justTime || !allowTimeZone && zoneIdOpt.isDefined) { @@ -845,10 +847,11 @@ trait SparkDateTimeUtils { } /** - * ANSI variant of [[stringToTimestampNTZNanos]]. Throws [[org.apache.spark.SparkDateTimeException]] - * on invalid input. Uses `allowTimeZone = true`: a time zone component in the string is silently - * discarded rather than rejected. Callers that need strict NTZ rejection should call - * [[stringToTimestampNTZNanos]] directly with `allowTimeZone = false`. + * ANSI variant of [[stringToTimestampNTZNanos]]. Throws + * [[org.apache.spark.SparkDateTimeException]] on invalid input. Uses `allowTimeZone = true`: a + * time zone component in the string is silently discarded rather than rejected. Callers that + * need strict NTZ rejection should call [[stringToTimestampNTZNanos]] directly with + * `allowTimeZone = false`. */ def stringToTimestampNTZNanosAnsi( s: UTF8String, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala index 4b31e38f0f905..e5e8f05c69542 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampNanosParseSuite.scala @@ -197,7 +197,8 @@ class TimestampNanosParseSuite extends SparkFunSuite { val result = stringToTimestampNTZNanosAnsi( UTF8String.fromString("2015-03-18T12:03:17.123456789Z"), 9) assert(result === - TimestampNanosVal.fromParts(date(2015, 3, 18, 12, 3, 17, 123456, ZoneOffset.UTC), 789.toShort)) + TimestampNanosVal.fromParts( + date(2015, 3, 18, 12, 3, 17, 123456, ZoneOffset.UTC), 789.toShort)) } test("ANSI variants throw on invalid input") {