From e6e5f79bfed371dab293546b95aa198c490aa9b4 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Sat, 30 May 2026 11:10:45 +0000 Subject: [PATCH 1/2] [SPARK-57173][SQL] Simplify regexp pattern-compile codegen by extracting a static Java helper ### What changes were proposed in this pull request? Add `ExpressionImplUtils.compileRegexPattern(String regex, int flags, String funcName)`, which wraps `Pattern.compile` and maps a `PatternSyntaxException` to the user-facing INVALID_PARAMETER_VALUE.PATTERN error. Route both the shared codegen (`RegExpUtils.initLastMatcherCode`, used by the whole regexp expression family -- RLike, RegExpReplace, RegExpExtract, RegExpExtractAll, RegExpInStr, etc.) and the eval helper (`RegExpUtils.getPatternAndLastRegex`) through it. `initLastMatcherCode` previously emitted a 5-line inline `try { Pattern.compile } catch (PatternSyntaxException)` block; it now emits a single helper call. The per-stage mutable-state caching (`lastRegex` / `pattern`) is preserved in the generated code. ### Why are the changes needed? Part of SPARK-56908 (umbrella). This block is emitted by every regexp expression in every stage that uses one; collapsing it to a single call shrinks the generated Java across the whole family. ### Does this PR introduce _any_ user-facing change? No. The compiled behavior is identical; only the emitted Java source text changes. ### How was this patch tested? ``` build/sbt "catalyst/testOnly *RegexpExpressionsSuite" ``` 21/21 pass (exercised both with and without whole-stage codegen). ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude Code (Opus 4.8) Co-authored-by: Isaac --- .../expressions/ExpressionImplUtils.java | 17 +++++++++++++++++ .../expressions/regexpExpressions.scala | 19 ++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java index a5228edc33c83..01aac856b353c 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java @@ -25,6 +25,8 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import javax.crypto.Cipher; import javax.crypto.spec.GCMParameterSpec; import javax.crypto.spec.IvParameterSpec; @@ -342,4 +344,19 @@ public static UTF8String quote(UTF8String str) { String sp = str.toString().replaceAll(qtChar, qtCharRep); return UTF8String.fromString(qtChar + sp + qtChar); } + + /** + * Compiles {@code regex} with the given {@code flags} for the regexp expression + * family, translating a {@link PatternSyntaxException} into the user-facing + * INVALID_PARAMETER_VALUE.PATTERN error. Shared by the regexp eval and codegen + * paths so the generated Java is a single call instead of an inline try/catch + * around {@code Pattern.compile}. + */ + public static Pattern compileRegexPattern(String regex, int flags, String funcName) { + try { + return Pattern.compile(regex, flags); + } catch (PatternSyntaxException e) { + throw QueryExecutionErrors.invalidPatternError(funcName, e.getPattern(), e); + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 5ad360a54e8d5..8b593222ab7ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -1253,17 +1253,14 @@ object RegExpUtils { val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex") val termPattern = ctx.addMutableState(classNamePattern, "pattern") val collationRegexFlags = CollationSupport.collationAwareRegexFlags(collationId) + val utils = classOf[ExpressionImplUtils].getName s""" |if (!$regexp.equals($termLastRegex)) { | // regex value changed - | try { - | UTF8String r = $regexp.clone(); - | $termPattern = $classNamePattern.compile(r.toString(), $collationRegexFlags); - | $termLastRegex = r; - | } catch (java.util.regex.PatternSyntaxException e) { - | throw QueryExecutionErrors.invalidPatternError("$prettyName", e.getPattern(), e); - | } + | UTF8String r = $regexp.clone(); + | $termPattern = $utils.compileRegexPattern(r.toString(), $collationRegexFlags, "$prettyName"); + | $termLastRegex = r; |} |java.util.regex.Matcher $matcher = $termPattern.matcher($subject.toString()); |""".stripMargin @@ -1272,12 +1269,8 @@ object RegExpUtils { def getPatternAndLastRegex(p: Any, prettyName: String, collationId: Int): (Pattern, UTF8String) = { val r = p.asInstanceOf[UTF8String].clone() - val pattern = try { - Pattern.compile(r.toString, CollationSupport.collationAwareRegexFlags(collationId)) - } catch { - case e: PatternSyntaxException => - throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e) - } + val pattern = ExpressionImplUtils.compileRegexPattern( + r.toString, CollationSupport.collationAwareRegexFlags(collationId), prettyName) (pattern, r) } } From 09b4bc2486963083f93cc4b16a99dce0962d6e71 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Sat, 30 May 2026 11:23:01 +0000 Subject: [PATCH 2/2] Wrap generated compileRegexPattern call to satisfy scalastyle 100-char limit Co-authored-by: Isaac --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 8b593222ab7ab..c2c01d2c78159 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -1259,7 +1259,8 @@ object RegExpUtils { |if (!$regexp.equals($termLastRegex)) { | // regex value changed | UTF8String r = $regexp.clone(); - | $termPattern = $utils.compileRegexPattern(r.toString(), $collationRegexFlags, "$prettyName"); + | $termPattern = + | $utils.compileRegexPattern(r.toString(), $collationRegexFlags, "$prettyName"); | $termLastRegex = r; |} |java.util.regex.Matcher $matcher = $termPattern.matcher($subject.toString());