From 45165dacca76b93c23d0351e61614ab792f976e6 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Sat, 30 May 2026 11:06:26 +0000 Subject: [PATCH] [SPARK-57172][SQL] Simplify Crc32 codegen by extracting a static Java helper ### What changes were proposed in this pull request? Add `ExpressionImplUtils.crc32(byte[] bytes)` and route `Crc32`'s eval and codegen paths through it. `Crc32.doGenCode` previously emitted a 3-line allocate / `update` / `getValue` sequence inline; it now emits a single `ExpressionImplUtils.crc32(...)` call, and the eval path calls the same helper. This is a plain (non-ANSI, non-try/catch) type-independent block, in line with the broadened goal of SPARK-56908 to move fixed generated-Java logic into static Java helpers. ### Why are the changes needed? Part of SPARK-56908 (umbrella). Collapsing the inline CRC32 sequence to one call shrinks the generated Java for every stage that computes `crc32`. ### Does this PR introduce _any_ user-facing change? No. The computed checksum is unchanged; the eval path and the generated Java now call the shared helper instead of inlining the CRC32 sequence. ### How was this patch tested? ``` build/sbt "catalyst/testOnly *HashExpressionsSuite" ``` 40/40 pass, including `crc32` (exercised both with and without whole-stage codegen). ### Was this patch authored or co-authored using generative AI tooling? 
Generated-by: Claude Code (Opus 4.8) Co-authored-by: Isaac --- .../catalyst/expressions/ExpressionImplUtils.java | 12 ++++++++++++ .../spark/sql/catalyst/expressions/hash.scala | 14 +++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java index a5228edc33c83..7bad7c430b862 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.zip.CRC32; import javax.crypto.Cipher; import javax.crypto.spec.GCMParameterSpec; import javax.crypto.spec.IvParameterSpec; @@ -342,4 +343,15 @@ public static UTF8String quote(UTF8String str) { String sp = str.toString().replaceAll(qtChar, qtCharRep); return UTF8String.fromString(qtChar + sp + qtChar); } + + /** + * Computes the CRC32 checksum of {@code bytes} for the {@code crc32} expression. + * Shared by the eval and codegen paths so the per-stage generated Java is a + * single call rather than an inline allocate / update / getValue sequence. 
+ */ + public static long crc32(byte[] bytes) { + CRC32 checksum = new CRC32(); + checksum.update(bytes, 0, bytes.length); + return checksum.getValue(); + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 0f27dee9dbc84..0b0c84176691e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.expressions import java.math.{BigDecimal, RoundingMode} import java.util.concurrent.TimeUnit._ -import java.util.zip.CRC32 import scala.annotation.tailrec @@ -214,20 +213,13 @@ case class Crc32(child: Expression) override def contextIndependentFoldable: Boolean = child.contextIndependentFoldable protected override def nullSafeEval(input: Any): Any = { - val checksum = new CRC32 - checksum.update(input.asInstanceOf[Array[Byte]], 0, input.asInstanceOf[Array[Byte]].length) - checksum.getValue + ExpressionImplUtils.crc32(input.asInstanceOf[Array[Byte]]) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val CRC32 = "java.util.zip.CRC32" - val checksum = ctx.freshName("checksum") + val utils = classOf[ExpressionImplUtils].getName nullSafeCodeGen(ctx, ev, value => { - s""" - $CRC32 $checksum = new $CRC32(); - $checksum.update($value, 0, $value.length); - ${ev.value} = $checksum.getValue(); - """ + s"${ev.value} = $utils.crc32($value);" }) }