From d12908f34e1402d2efe3327cc9e6f143b87f1838 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 3 Mar 2026 13:52:52 -0600 Subject: [PATCH 1/4] Remove DSLNode-based implementation The recursive DSLTree approach required nodes nested in nodes. The DSLTree implementation flattened this tree, but didn't remove the nested nodes, instead replacing them with placeholder entries that were unused, in order to reduce the code change. This change completes the transition, modifying the nodes to no longer store their children. With this change, we can also remove a significant amount of unused code. The change includes some expanded test coverage of the RegexBuilder syntax conversion, as well. Note that even though `DSLTree.Node` no longer requires declaration as `indirect`, compilation times are worsened if we remove the indirection. --- .../ByteCodeGen+DSLList.swift | 64 +- Sources/_StringProcessing/ByteCodeGen.swift | 683 +----------------- Sources/_StringProcessing/Compiler.swift | 23 +- .../_StringProcessing/LiteralPrinter.swift | 161 +---- .../Optimizations/AutoPossessification.swift | 36 +- .../_StringProcessing/PrintAsPattern.swift | 386 ++++------ .../Regex/ASTConversion.swift | 98 +-- Sources/_StringProcessing/Regex/Core.swift | 18 +- Sources/_StringProcessing/Regex/DSLList.swift | 133 ++-- Sources/_StringProcessing/Regex/DSLTree.swift | 587 ++------------- Sources/_StringProcessing/Regex/Options.swift | 2 +- .../Utility/RegexFactory.swift | 26 +- Tests/RegexTests/CaptureTests.swift | 2 +- Tests/RegexTests/OptimizationTests.swift | 4 +- Tests/RegexTests/RenderDSLTests.swift | 95 ++- 15 files changed, 418 insertions(+), 1900 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift index 9432764e5..be4f92c05 100644 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -73,19 +73,19 @@ fileprivate extension Compiler.ByteCodeGen { 
return nil // In an alternation, all of its children must match only at start. - case .orderedChoice(let children): - for _ in 0.. 0 { return _guaranteesForwardProgressImpl(list, position: &position) @@ -719,7 +719,7 @@ fileprivate extension Compiler.ByteCodeGen { } else { return false } - case .nonCapturingGroup(let groupKind, _): + case .nonCapturingGroup(let groupKind): // .nonCapture nonCapturingGroups are ignored during compilation guard groupKind.ast == .nonCapture else { return false @@ -751,15 +751,13 @@ fileprivate extension Compiler.ByteCodeGen { guard let node = list.popFirst() else { return nil } switch node { - case let .orderedChoice(children): - let n = children.count + case let .orderedChoice(n): try emitAlternation(&list, alternationCount: n) - - case let .concatenation(children): - let n = children.count + + case let .concatenation(n): try emitConcatenation(&list, componentCount: n) - case let .capture(name, refId, _, transform): + case let .capture(name, refId, transform): options.beginScope() defer { options.endScope() } @@ -793,19 +791,19 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildTransformCapture(cap, fn) } - case let .nonCapturingGroup(kind, _): + case let .nonCapturingGroup(kind): try emitNoncapturingGroup(kind.ast, &list) - case .ignoreCapturesInTypedOutput(_): + case .ignoreCapturesInTypedOutput: try emitNode(&list) - case .limitCaptureNesting(_): + case .limitCaptureNesting: return try emitNode(&list) case .conditional: throw Unsupported("Conditionals") - case let .quantification(amt, kind, _): + case let .quantification(amt, kind): try emitQuantification(amt.ast, kind, &list) case let .customCharacterClass(ccc): @@ -852,19 +850,17 @@ extension Compiler.ByteCodeGen { ) throws { guard let node = list.popFirst() else { return } switch node { - case let .orderedChoice(children): - let n = children.count + case let .orderedChoice(n): for _ in 0.. 
MEProgram { - // If the whole regex is a matcher, then the whole-match value - // is the constructed value. Denote that the current value - // register is the processor's value output. - switch root { - case .matcher: - builder.denoteCurrentValueIsWholeMatchValue() - default: - break - } - - try emitNode(root) - - builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart() - builder.buildAccept() - return try builder.assemble() - } -} - extension Compiler.ByteCodeGen { mutating func emitAtom(_ a: DSLTree.Atom) throws { defer { @@ -279,9 +259,15 @@ extension Compiler.ByteCodeGen { mutating func emitDot() throws { if options.dotMatchesNewline { if options.usesNSRECompatibleDot { - try emitAlternation([ - .atom(.characterClass(.newlineSequence)), - .atom(.anyNonNewline)]) + // Custom expansion of emitAlternation for (?:newlineSequence|anyNonNewline) + let done = builder.makeAddress() + let next = builder.makeAddress() + builder.buildSave(next) + emitCharacterClass(.newlineSequence) + builder.buildBranch(to: done) + builder.label(next) + emitAnyNonNewline() + builder.label(done) } else { emitAny() } @@ -326,126 +312,6 @@ extension Compiler.ByteCodeGen { builder.label(done) } - mutating func emitAlternation( - _ children: [DSLTree.Node] - ) throws { - try emitAlternationGen(children, withBacktracking: true) { - try $0.emitNode($1) - } - } - - mutating func emitConcatenationComponent( - _ node: DSLTree.Node - ) throws { - // TODO: Should we do anything special since we can - // be glueing sub-grapheme components together? - try emitNode(node) - } - - mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws { - /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... 
- */ - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSave(success) - builder.buildSave(intercept) - try emitNode(child) - builder.buildClearThrough(intercept) - builder.buildFail(preservingCaptures: true) // Lookahead succeeds here - - builder.label(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(success) - } - - mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws { - /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - clearSavePoint // remove success - fail // propagate failure - intercept: - fail // ->success - success: - ... - */ - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSave(success) - builder.buildSave(intercept) - try emitNode(child) - builder.buildClearThrough(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(intercept) - builder.buildFail() - - builder.label(success) - } - - mutating func emitLookaround( - _ kind: (forwards: Bool, positive: Bool), - _ child: DSLTree.Node - ) throws { - guard kind.forwards else { - throw Unsupported("backwards assertions") - } - if kind.positive { - try emitPositiveLookahead(child) - } else { - try emitNegativeLookahead(child) - } - } - - mutating func emitAtomicNoncapturingGroup( - _ child: DSLTree.Node - ) throws { - /* - save(continuingAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... 
- */ - - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSaveAddress(success) - builder.buildSave(intercept) - try emitNode(child) - builder.buildClearThrough(intercept) - builder.buildFail(preservingCaptures: true) // Atomic group succeeds here - - builder.label(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(success) - } - mutating func emitMatcher( _ matcher: @escaping _MatcherInterface ) -> ValueRegister { @@ -463,358 +329,6 @@ extension Compiler.ByteCodeGen { return valReg } - mutating func emitNoncapturingGroup( - _ kind: AST.Group.Kind, - _ child: DSLTree.Node - ) throws { - assert(!kind.isCapturing) - - options.beginScope() - defer { options.endScope() } - - if let lookaround = kind.lookaroundKind { - try emitLookaround(lookaround, child) - return - } - - switch kind { - case .lookahead, .negativeLookahead, - .lookbehind, .negativeLookbehind: - throw Unreachable("TODO: reason") - - case .capture, .namedCapture, .balancedCapture: - throw Unreachable("These should produce a capture node") - - case .changeMatchingOptions(let optionSequence): - if !hasEmittedFirstMatchableAtom { - builder.initialOptions.apply(optionSequence) - } - options.apply(optionSequence) - try emitNode(child) - - case .atomicNonCapturing: - try emitAtomicNoncapturingGroup(child) - - default: - // FIXME: Other kinds... - try emitNode(child) - } - } - - mutating func emitQuantification( - _ amount: AST.Quantification.Amount, - _ kind: DSLTree.QuantificationKind, - _ child: DSLTree.Node - ) throws { - let updatedKind = kind.applying(options: options) - - let (low, high) = amount.bounds - guard let low = low else { - throw Unreachable("Must have a lower bound") - } - switch (low, high) { - case (_, 0): - // TODO: Should error out earlier, maybe DSL and parser - // has validation logic? - return - case let (n, m?) where n > m: - // TODO: Should error out earlier, maybe DSL and parser - // has validation logic? 
- return - - case let (n, m) where m == nil || n <= m!: - // Ok - break - default: - throw Unreachable("TODO: reason") - } - - // Compiler and/or parser should enforce these invariants - // before we are called - assert(high != 0) - assert((0...(high ?? Int.max)).contains(low)) - - let maxExtraTrips: Int? - if let h = high { - maxExtraTrips = h - low - } else { - maxExtraTrips = nil - } - let minTrips = low - assert((maxExtraTrips ?? 1) >= 0) - - if tryEmitFastQuant(child, updatedKind, minTrips, maxExtraTrips) { - return - } - - // The below is a general algorithm for bounded and unbounded - // quantification. It can be specialized when the min - // is 0 or 1, or when extra trips is 1 or unbounded. - // - // Stuff inside `<` and `>` are decided at compile time, - // while run-time values stored in registers start with a `%` - _ = """ - min-trip-count control block: - if %minTrips is zero: - goto exit-policy control block - else: - decrement %minTrips and fallthrough - - loop-body: - : - mov currentPosition %pos - evaluate the subexpression - : - if %pos is currentPosition: - goto exit - goto min-trip-count control block - - exit-policy control block: - if %maxExtraTrips is zero: - goto exit - else: - decrement %maxExtraTrips and fallthrough - - : - save exit and goto loop-body - : - ratchet and goto loop - : - save loop-body and fallthrough (i.e. goto exit) - - exit - ... the rest of the program ... - """ - - // Specialization based on `minTrips` for 0 or 1: - _ = """ - min-trip-count control block: - : - goto exit-policy - : - /* fallthrough */ - - loop-body: - evaluate the subexpression - - /* fallthrough */ - """ - - // Specialization based on `maxExtraTrips` for 0 or unbounded - _ = """ - exit-policy control block: - : - goto exit - : - /* fallthrough */ - """ - - /* - NOTE: These specializations don't emit the optimal - code layout (e.g. 
fallthrough vs goto), but that's better - done later (not prematurely) and certainly better - done by an optimizing compiler. - - NOTE: We're intentionally emitting essentially the same - algorithm for all quantifications for now, for better - testing and surfacing difficult bugs. We can specialize - for other things, like `.*`, later. - - When it comes time for optimizing, we can also look into - quantification instructions (e.g. reduce save-point traffic) - */ - - let minTripsControl = builder.makeAddress() - let loopBody = builder.makeAddress() - let exitPolicy = builder.makeAddress() - let exit = builder.makeAddress() - - // We'll need registers if we're (non-trivially) bounded - let minTripsReg: IntRegister? - if minTrips > 1 { - minTripsReg = builder.makeIntRegister( - initialValue: minTrips) - } else { - minTripsReg = nil - } - - let maxExtraTripsReg: IntRegister? - if (maxExtraTrips ?? 0) > 0 { - maxExtraTripsReg = builder.makeIntRegister( - initialValue: maxExtraTrips!) - } else { - maxExtraTripsReg = nil - } - - // Set up a dummy save point for possessive to update - if updatedKind == .possessive { - builder.pushEmptySavePoint() - } - - // min-trip-count: - // condBranch(to: exitPolicy, ifZeroElseDecrement: %min) - builder.label(minTripsControl) - switch minTrips { - case 0: builder.buildBranch(to: exitPolicy) - case 1: break - default: - assert(minTripsReg != nil, "logic inconsistency") - builder.buildCondBranch( - to: exitPolicy, ifZeroElseDecrement: minTripsReg!) - } - - // FIXME: Possessive needs a "dummy" save point to ratchet - - // loop: - // - // branch min-trip-count - builder.label(loopBody) - - // if we aren't sure if the child node will have forward progress and - // we have an unbounded quantification - let startPosition: PositionRegister? 
- let emitPositionChecking = - (!optimizationsEnabled || !child.guaranteesForwardProgress) && - maxExtraTrips == nil - - if emitPositionChecking { - startPosition = builder.makePositionRegister() - builder.buildMoveCurrentPosition(into: startPosition!) - } else { - startPosition = nil - } - try emitNode(child) - if emitPositionChecking { - // in all quantifier cases, no matter what minTrips or maxExtraTrips is, - // if we have a successful non-advancing match, branch to exit because it - // can match an arbitrary number of times - builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) - } - - if minTrips <= 1 { - // fallthrough - } else { - builder.buildBranch(to: minTripsControl) - } - - // exit-policy: - // condBranch(to: exit, ifZeroElseDecrement: %maxExtraTrips) - // - // - // Bool { - let isScalarSemantics = options.semanticLevel == .unicodeScalar - guard optimizationsEnabled - && minTrips <= QuantifyPayload.maxStorableTrips - && maxExtraTrips ?? 0 <= QuantifyPayload.maxStorableTrips - && kind != .reluctant else { - return false - } - switch child { - case .customCharacterClass(let ccc): - // ascii only custom character class - guard let bitset = ccc.asAsciiBitset(options) else { - return false - } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - - case .atom(let atom): - switch atom { - case .char(let c): - if options.isCaseInsensitive && c.isCased { - // Cased character with case-insensitive matching; match only as an ASCII bitset - guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { - return false - } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - } else { - // Uncased character OR case-sensitive matching; match as a single scalar ascii value character - guard let val = c._singleScalarAsciiValue else { - return false - } - builder.buildQuantify(asciiChar: val, kind, 
minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - } - - case .any: - builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - case .anyNonNewline: - builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - case .dot: - builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - - case .characterClass(let cc): - // Custom character class that consumes a single grapheme - let model = cc.asRuntimeModel(options) - builder.buildQuantify( - model: model, - kind, - minTrips, - maxExtraTrips, - isScalarSemantics: isScalarSemantics) - default: - return false - } - case .limitCaptureNesting(let node): - return tryEmitFastQuant(node, kind, minTrips, maxExtraTrips) - case .nonCapturingGroup(let groupKind, let node): - // .nonCapture nonCapturingGroups are ignored during compilation - guard groupKind.ast == .nonCapture else { - return false - } - return tryEmitFastQuant(node, kind, minTrips, maxExtraTrips) - default: - return false - } - return true - } - /// Coalesce any adjacent scalar members in a custom character class together. /// This is required in order to produce correct grapheme matching behavior. func coalescingCustomCharacterClassMembers( @@ -1185,185 +699,6 @@ extension Compiler.ByteCodeGen { try $0.emitCCCMember($1) } } - - mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { - // Before emitting a concatenation, we need to flatten out any nested - // concatenations, and coalesce any adjacent characters and scalars, forming - // quoted literals of their contents, over which we can perform grapheme - // breaking. 
- func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { - switch node { - case .concatenation(let ch): - return ch.flatMap(flatten) - case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): - return flatten(n) - default: - return [node] - } - } - let children = children - .flatMap(flatten) - .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in - switch node { - case .atom(let a): - guard let c = a.literalCharacterValue else { return false } - str.append(c) - return true - case .quotedLiteral(let q): - str += q - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !str.isEmpty - default: - return false - } - } - for child in children { - try emitConcatenationComponent(child) - } - } - - @discardableResult - mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? { - switch node { - - case let .orderedChoice(children): - try emitAlternation(children) - - case let .concatenation(children): - try emitConcatenation(children) - - case let .capture(name, refId, child, transform): - options.beginScope() - defer { options.endScope() } - - let cap = builder.makeCapture(id: refId, name: name) - builder.buildBeginCapture(cap) - let value = try emitNode(child) - builder.buildEndCapture(cap) - // If the child node produced a custom capture value, e.g. the result of - // a matcher, this should override the captured substring. - if let value { - builder.buildMove(value, into: cap) - } - // If there's a capture transform, apply it now. - if let transform = transform { - let fn = builder.makeTransformFunction { input, cap in - // If it's a substring capture with no custom value, apply the - // transform directly to the substring to avoid existential traffic. - // - // FIXME: separate out this code path. 
This is fragile, - // slow, and these are clearly different constructs - if let range = cap.range, cap.value == nil { - return try transform(input[range]) - } - - let value = constructExistentialOutputComponent( - from: input, - component: cap.deconstructed, - optionalCount: 0) - return try transform(value) - } - builder.buildTransformCapture(cap, fn) - } - - case let .nonCapturingGroup(kind, child): - try emitNoncapturingGroup(kind.ast, child) - - case let .ignoreCapturesInTypedOutput(child): - try emitNode(child) - - case let .limitCaptureNesting(child): - return try emitNode(child) - - case .conditional: - throw Unsupported("Conditionals") - - case let .quantification(amt, kind, child): - try emitQuantification(amt.ast, kind, child) - - case let .customCharacterClass(ccc): - if ccc.containsDot { - if !ccc.isInverted { - try emitDot() - } else { - throw Unsupported("Inverted any") - } - } else { - try emitCustomCharacterClass(ccc) - } - - case let .atom(a): - try emitAtom(a) - - case let .quotedLiteral(s): - emitQuotedLiteral(s) - - case .absentFunction: - throw Unsupported("absent function") - case .consumer: - throw Unsupported("consumer") - - case let .matcher(_, f): - return emitMatcher(f) - - case .characterPredicate: - throw Unsupported("character predicates") - - case .trivia, .empty: - return nil - } - return nil - } -} - -extension DSLTree.Node { - /// A Boolean value indicating whether this node advances the match position - /// on a successful match. - /// - /// For example, an alternation like `(a|b|c)` always advances the position - /// by a character, but `(a|b|)` has an empty branch, which matches without - /// advancing. 
- var guaranteesForwardProgress: Bool { - switch self { - case .orderedChoice(let children): - return children.allSatisfy { $0.guaranteesForwardProgress } - case .concatenation(let children): - return children.contains(where: { $0.guaranteesForwardProgress }) - case .capture(_, _, let node, _): - return node.guaranteesForwardProgress - case .nonCapturingGroup(let kind, let child): - switch kind.ast { - case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind: - return false - default: return child.guaranteesForwardProgress - } - case .atom(let atom): - switch atom { - case .changeMatchingOptions, .assertion: return false - // Captures may be nil so backreferences may be zero length matches - case .backreference: return false - default: return true - } - case .trivia, .empty: - return false - case .quotedLiteral(let string): - return !string.isEmpty - case .consumer, .matcher: - // Allow zero width consumers and matchers - return false - case .customCharacterClass(let ccc): - return ccc.guaranteesForwardProgress - case .quantification(let amount, _, let child): - let (atLeast, _) = amount.ast.bounds - return atLeast ?? 
0 > 0 && child.guaranteesForwardProgress - case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node): - return node.guaranteesForwardProgress - default: return false - } - } } extension DSLTree.CustomCharacterClass { diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 89c8f5f34..09cdef90d 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -19,22 +19,13 @@ class Compiler { private var compileOptions: _CompileOptions = .default init(ast: AST) { - self.tree = DSLList(tree: ast.dslTree) - } - - init(tree: DSLTree) { - self.tree = DSLList(tree: tree) + self.tree = DSLList(ast: ast) } init(list: DSLList) { self.tree = list } - init(tree: DSLTree, compileOptions: _CompileOptions) { - self.tree = DSLList(tree: tree) - self.compileOptions = compileOptions - } - init(tree: DSLList, compileOptions: _CompileOptions) { self.tree = tree self.compileOptions = compileOptions @@ -44,20 +35,8 @@ class Compiler { try emitViaList() } - __consuming func emitViaTree() throws -> MEProgram { - // TODO: Handle global options - _ = ByteCodeGen( - options: options, - compileOptions: - compileOptions, - captureList: tree.captureList) - fatalError() -// return try codegen.emitRoot(tree.root) - } - __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options -// var dslList = DSLList(tree: tree) var codegen = ByteCodeGen( options: options, compileOptions: diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index d9cdbb04e..e841e4b5d 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -102,12 +102,12 @@ extension LiteralPrinter { } switch node { - case let .orderedChoice(children): - try outputAlternation(&list, count: children.count) - case let .concatenation(children): - try outputConcatenation(&list, count: children.count) + case 
let .orderedChoice(count): + try outputAlternation(&list, count: count) + case let .concatenation(count): + try outputConcatenation(&list, count: count) - case let .capture(name, nil, _, nil): + case let .capture(name, nil, nil): options.beginScope() defer { options.endScope() } try outputCapture(&list, name: name) @@ -116,7 +116,7 @@ extension LiteralPrinter { try inconvertible(node) return - case let .nonCapturingGroup(kind, _): + case let .nonCapturingGroup(kind): guard let kindPattern = kind._patternString else { try inconvertible(node) return @@ -131,10 +131,10 @@ extension LiteralPrinter { try outputList(&list) output(")") - case .ignoreCapturesInTypedOutput(_), - .limitCaptureNesting(_): + case .ignoreCapturesInTypedOutput, + .limitCaptureNesting: try outputList(&list) - case let .quantification(amount, kind, _): + case let .quantification(amount, kind): try outputQuantification(&list, amount: amount, kind: kind) case let .customCharacterClass(charClass): outputCustomCharacterClass(charClass) @@ -182,8 +182,8 @@ extension LiteralPrinter { func requiresGrouping(_ list: ArraySlice) -> Bool { guard let node = list.first else { return false } // malformed? 
switch node { - case .concatenation(let children): - switch children.count { + case .concatenation(let count): + switch count { case 0: return false case 1: @@ -239,144 +239,7 @@ extension LiteralPrinter { } } -extension LiteralPrinter { - mutating func outputNode(_ node: DSLTree.Node) { - switch node { - case let .orderedChoice(children): - outputAlternation(children) - case let .concatenation(children): - outputConcatenation(children) - - case let .capture(name, nil, child, nil): - options.beginScope() - defer { options.endScope() } - outputCapture(name, child) - case .capture: - // Captures that use a reference or a transform are unsupported - saveInconvertible(node) - - case let .nonCapturingGroup(kind, child): - guard let kindPattern = kind._patternString else { - saveInconvertible(node) - return - } - options.beginScope() - defer { options.endScope() } - - output(kindPattern) - if case .changeMatchingOptions(let optionSequence) = kind.ast { - options.apply(optionSequence) - } - outputNode(child) - output(")") - - case let .ignoreCapturesInTypedOutput(child), - let .limitCaptureNesting(child): - outputNode(child) - case let .quantification(amount, kind, node): - outputQuantification(amount, kind, node) - case let .customCharacterClass(charClass): - outputCustomCharacterClass(charClass) - case let .atom(atom): - outputAtom(atom) - case let .quotedLiteral(literal): - output(prepareQuotedLiteral(literal)) - - case .trivia(_): - // TODO: Include trivia? 
- return - case .empty: - return - - case .conditional, .absentFunction, .consumer, .matcher, .characterPredicate: - saveInconvertible(node) - } - } - - mutating func outputAlternation(_ children: [DSLTree.Node]) { - guard let first = children.first else { return } - - outputNode(first) - for child in children.dropFirst() { - output("|") - outputNode(child) - } - } - - mutating func outputConcatenation(_ children: [DSLTree.Node]) { - for child in children { - outputNode(child) - } - } - - mutating func outputCapture(_ name: String?, _ child: DSLTree.Node) { - if let name { - output("(?<\(name)>") - } else { - output("(") - } - outputNode(child) - output(")") - } - - func requiresGrouping(_ node: DSLTree.Node) -> Bool { - switch node { - case .concatenation(let children): - switch children.count { - case 0: - return false - case 1: - return requiresGrouping(children.first!) - default: - return true - } - - case .quotedLiteral(let literal): - return prepareQuotedLiteral(literal).count > 1 - - default: - return false - } - } - - mutating func outputQuantification( - _ amount: DSLTree._AST.QuantificationAmount, - _ kind: DSLTree.QuantificationKind, - _ child: DSLTree.Node - ) { - // RegexBuilder regexes can have children that need - if requiresGrouping(child) { - output("(?:") - outputNode(child) - output(")") - } else { - outputNode(child) - } - - switch amount.ast { - case .zeroOrMore: - output("*") - case .oneOrMore: - output("+") - case .zeroOrOne: - output("?") - case let .exactly(n): - output("{\(n.value!)}") - case let .nOrMore(n): - output("{\(n.value!),}") - case let .upToN(n): - output("{,\(n.value!)}") - case let .range(low, high): - output("{\(low.value!),\(high.value!)}") - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - - outputQuantificationKind(kind) - } - +extension LiteralPrinter { mutating func outputQuantificationKind(_ kind: DSLTree.QuantificationKind) { guard let astKind = kind.quantificationKind?.ast else { // We can 
treat this as if the current default had been given explicity. diff --git a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift index 46ec4d460..77f5ae868 100644 --- a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift +++ b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift @@ -39,10 +39,10 @@ extension DSLList { // In a concatenation, the first definitive child provides the answer, // and then we need to skip past (in some cases at least) the remaining // concatenation elements. - case .concatenation(let children): + case .concatenation(let count): var result: DSLTree.Atom?? = nil var i = 0 - while i < children.count { + while i < count { i += 1 position += 1 if let r = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) { @@ -50,8 +50,8 @@ extension DSLList { break } } - - for _ in i..( - _ ast: T - ) -> Bool { - if let max = maxTopDownLevels, depth >= max { - return true - } - if let min = minBottomUpLevels, ast.height <= min { - return true - } - return false - } - - mutating func printBackoff(_ node: DSLTree.Node) { - precondition(node.astNode != nil, "unconverted node") - printAsCanonical( - .init(node.astNode!, globalOptions: nil, diags: Diagnostics()), - delimiters: true) - } - mutating func printAsPattern(_ ast: AST) { - // TODO: Handle global options... - let node = ast.root.dslTreeNode - + let list = DSLList(ast: ast) + // If we have any named captures, create references to those above the regex. - let namedCaptures = node.getNamedCaptures() - + let namedCaptures = list.getNamedCaptures() + for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node, isTopLevel: true) + var slice = list.nodes[...] 
+ printer.printAsPatternFromList(&slice, isTopLevel: true) } printInlineMatchingOptions() } - mutating func printInlineMatchingOptions() { - while !inlineMatchingOptions.isEmpty { - let (options, condition) = popMatchingOptions() - - printIndented { printer in - for option in options { - switch option.kind { - case .asciiOnlyDigit: - printer.print(".asciiOnlyDigits(\(condition))") - - case .asciiOnlyPOSIXProps: - printer.print(".asciiOnlyCharacterClasses(\(condition))") - - case .asciiOnlySpace: - printer.print(".asciiOnlyWhitespace(\(condition))") - - case .asciiOnlyWord: - printer.print(".asciiOnlyWordCharacters(\(condition))") - - case .caseInsensitive: - printer.print(".ignoresCase(\(condition))") - - case .multiline: - printer.print(".anchorsMatchLineEndings(\(condition))") - - case .reluctantByDefault: - // This is handled by altering every OneOrMore, etc by changing each - // individual repetition behavior instead of creating a nested regex. - continue - - case .singleLine: - printer.print(".dotMatchesNewlines(\(condition))") - - default: - break - } - } - } - - print("}") - } - } - - // FIXME: Use of back-offs like height and depth - // imply that this DSLTree node has a corresponding - // AST. That's not always true, and it would be nice - // to have a non-backing-off pretty-printer that this - // can defer to. 
- private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false + private mutating func printAsPatternFromList( + _ list: inout ArraySlice, + isTopLevel: Bool = false ) { - if patternBackoff(DSLTree._Tree(node)) { - printBackoff(node) - return - } + guard let node = list.popFirst() else { return } switch node { - - case let .orderedChoice(a): + case .orderedChoice(let count): printBlock("ChoiceOf") { printer in - a.forEach { - printer.printAsPattern(convertedFromAST: $0) + for _ in 0.., + count: Int, + isTopLevel: Bool ) { - // We need to coalesce any adjacent character and scalar elements into a - // string literal, preserving scalar syntax. - let nodes = nodes - .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } - .coalescing( - with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } - ) { literal, node in - guard case .dslNode(let node) = node else { return false } - switch node { - case let .atom(.char(c)): - literal.append(c) - return true - case let .atom(.scalar(s)): - literal.append(unescaped: s._dslBase) - return true - case .quotedLiteral(let q): - literal.append(q) - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !literal.isEmpty - default: - return false - } + if isTopLevel || count <= 1 { + for _ in 0.. 
[String] { - var result: [String] = [] - - switch self { - case .capture(let name?, _, _, _): - result.append(name) - - case .concatenation(let nodes): - for node in nodes { - result += node.getNamedCaptures() - } - - case .quantification(_, _, let node): - result += node.getNamedCaptures() - - default: - break - } - - return result - } -} diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 3bad55732..bfdabc8e9 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -11,23 +11,17 @@ internal import _RegexParser -extension AST { - var dslTree: DSLTree { - return DSLTree(.limitCaptureNesting(root.dslTreeNode)) - } -} - extension AST.Node { func convert(into list: inout [DSLTree.Node]) throws { switch self { case .alternation(let alternation): - list.append(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: alternation.children.count))) + list.append(.orderedChoice(alternation.children.count)) for child in alternation.children { try child.convert(into: &list) } case .concatenation(_): let coalesced = self.coalescedChildren - list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: coalesced.count))) + list.append(.concatenation(coalesced.count)) for child in coalesced { try child.convert(into: &list) } @@ -35,24 +29,24 @@ extension AST.Node { let child = group.child switch group.kind.value { case .capture: - list.append(.capture(TEMP_FAKE_NODE)) + list.append(.capture()) try child.convert(into: &list) case .namedCapture(let name): - list.append(.capture(name: name.value, TEMP_FAKE_NODE)) + list.append(.capture(name: name.value)) try child.convert(into: &list) case .balancedCapture: throw Unsupported("TODO: balanced captures") default: - list.append(.nonCapturingGroup(.init(ast: group.kind.value), TEMP_FAKE_NODE)) + list.append(.nonCapturingGroup(.init(ast: group.kind.value))) try child.convert(into: &list) } case 
.conditional(let conditional): - list.append(.conditional(.init(ast: conditional.condition.kind), TEMP_FAKE_NODE, TEMP_FAKE_NODE)) + list.append(.conditional(.init(ast: conditional.condition.kind))) try conditional.trueBranch.convert(into: &list) try conditional.falseBranch.convert(into: &list) case .quantification(let quant): list.append( - .quantification(.init(ast: quant.amount.value), .syntax(.init(ast: quant.kind.value)), TEMP_FAKE_NODE)) + .quantification(.init(ast: quant.amount.value), .syntax(.init(ast: quant.kind.value)))) try quant.child.convert(into: &list) case .quote(let node): list.append(.quotedLiteral(node.literal)) @@ -154,84 +148,6 @@ extension AST.Node { } } -extension AST.Node { - /// Converts an AST node to a `convertedRegexLiteral` node. - var dslTreeNode: DSLTree.Node { - // Convert the top-level node without wrapping - func convert() throws -> DSLTree.Node { - switch self { - case let .alternation(v): - let children = v.children.map(\.dslTreeNode) - return .orderedChoice(children) - - case let .concatenation(v): - return .concatenation(v.children.map(\.dslTreeNode)) - - case let .group(v): - let child = v.child.dslTreeNode - switch v.kind.value { - case .capture: - return .capture(child) - case .namedCapture(let name): - return .capture(name: name.value, child) - case .balancedCapture: - throw Unsupported("TODO: balanced captures") - default: - return .nonCapturingGroup(.init(ast: v.kind.value), child) - } - - case let .conditional(v): - let trueBranch = v.trueBranch.dslTreeNode - let falseBranch = v.falseBranch.dslTreeNode - return .conditional( - .init(ast: v.condition.kind), trueBranch, falseBranch) - - case let .quantification(v): - let child = v.child.dslTreeNode - return .quantification( - .init(ast: v.amount.value), .syntax(.init(ast: v.kind.value)), child) - - case let .quote(v): - return .quotedLiteral(v.literal) - - case let .trivia(v): - return .trivia(v.contents) - - case .interpolation: - throw Unsupported("TODO: interpolation") 
- - case let .atom(v): - switch v.kind { - case .scalarSequence(let seq): - // The DSL doesn't have an equivalent node for scalar sequences. Splat - // them into a concatenation of scalars. - return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) - default: - return .atom(v.dslTreeAtom) - } - - case let .customCharacterClass(ccc): - return .customCharacterClass(ccc.dslTreeClass) - - case .empty(_): - return .empty - - case let .absentFunction(abs): - // TODO: What should this map to? - return .absentFunction(.init(ast: abs)) - - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - } - - let converted = try! convert() - return converted - } -} - extension AST.CustomCharacterClass { var dslTreeClass: DSLTree.CustomCharacterClass { // TODO: Not quite 1-1 diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 425f64549..c1fbb36fa 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -11,8 +11,6 @@ internal import _RegexParser -let TEMP_FAKE_NODE = DSLTree.Node.empty - /// A type that represents a regular expression. /// /// You can use types that conform to `RegexComponent` as parameters to string @@ -199,10 +197,6 @@ extension Regex { self.list = DSLList(ast: ast) } - init(tree: DSLTree) { - self.list = DSLList(tree: tree) - } - init(list: DSLList) { self.list = list } @@ -261,11 +255,11 @@ extension Regex { // Use an existing concatenation if it's already the root; // otherwise, embed self and other in a new concatenation root. 
switch list.nodes[0] { - case .concatenation(let children): - list.nodes[0] = .concatenation(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) + case .concatenation(let count): + list.nodes[0] = .concatenation(count + 1) list.nodes.append(contentsOf: other.nodes) default: - list.nodes.insert(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) + list.nodes.insert(.concatenation(2), at: 0) list.nodes.append(contentsOf: other.nodes) } return Regex(list: list) @@ -274,11 +268,11 @@ extension Regex { func alternating(with other: some Collection) -> Regex { var nodes = program.list.nodes switch nodes[0] { - case .orderedChoice(let children): - nodes[0] = .orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) + case .orderedChoice(let count): + nodes[0] = .orderedChoice(count + 1) nodes.append(contentsOf: other) default: - nodes.insert(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) + nodes.insert(.orderedChoice(2), at: 0) nodes.append(contentsOf: other) } return Regex(list: DSLList(nodes)) diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index 98e478de4..8be1ccc7a 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -32,12 +32,8 @@ struct DSLList { self.nodes = nodes } - init(tree: DSLTree) { - self.nodes = Array(tree.depthFirst) - } - init(ast: AST) { - self.nodes = [.limitCaptureNesting(TEMP_FAKE_NODE)] + self.nodes = [.limitCaptureNesting] try! 
ast.root.convert(into: &nodes) } @@ -73,110 +69,63 @@ extension DSLTree.Node { return 0 case .orderedChoice(let c), .concatenation(let c): - return c.count + return c case .capture, .nonCapturingGroup, .quantification, .ignoreCapturesInTypedOutput, - .limitCaptureNesting, .conditional: + .limitCaptureNesting: return 1 + case .conditional: + return 2 + case .absentFunction: return 0 } } } -extension DSLTree { - struct DepthFirst: Sequence, IteratorProtocol { - typealias Element = DSLTree.Node - private var stack: [Frame] - private let getChildren: (Element) -> [Element] - - private struct Frame { - let node: Element - let children: [Element] - var nextIndex: Int = 0 - } - - fileprivate init( - root: Element, - getChildren: @escaping (Element) -> [Element] - ) { - self.getChildren = getChildren - self.stack = [Frame(node: root, children: getChildren(root))] +extension ArraySlice { + internal func skipNode(_ position: inout Int) { + guard position < endIndex else { + return } - - mutating func next() -> Element? { - guard let top = stack.popLast() else { return nil } - // Push children in reverse so leftmost comes out first. - for child in top.children.reversed() { - stack.append(Frame(node: child, children: getChildren(child))) + switch self[position] { + case let .orderedChoice(n): + for _ in 0.. { - internal func skipNode(_ position: inout Int) { - guard position < endIndex else { - return - } - switch self[position] { - case let .orderedChoice(children): - let n = children.count - for _ in 0.. Int? 
{ switch nodes[position] { - case .concatenation(let children): + case .concatenation(let count): var position = position + 1 if findLast { - for _ in 0..<(children.count - 1) { + for _ in 0..<(count - 1) { skipNode(&position) position += 1 } @@ -213,8 +162,8 @@ extension DSLList { Loop: while i >= 0 { switch other.nodes[i] { - case .concatenation(let children): - other.nodes[i] = .concatenation(.init(repeating: .empty, count: children.count - 1)) + case .concatenation(let count): + other.nodes[i] = .concatenation(count - 1) break Loop case .limitCaptureNesting, .ignoreCapturesInTypedOutput: other.nodes.remove(at: i) @@ -225,3 +174,15 @@ extension DSLList { } } } + +extension DSLList { + internal func getNamedCaptures() -> [String] { + var result: [String] = [] + for node in nodes { + if case .capture(let name?, _, _) = node, !result.contains(name) { + result.append(name) + } + } + return result + } +} diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f34d1d4d1..445785774 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -25,26 +25,26 @@ extension DSLTree { /// Matches each node in order. /// /// ... | ... | ... - case orderedChoice([Node]) + case orderedChoice(Int) /// Match each node in sequence. /// /// ... ... - case concatenation([Node]) + case concatenation(Int) /// Captures the result of a subpattern. /// /// (...), (?...) case capture( - name: String? = nil, reference: ReferenceID? = nil, Node, + name: String? = nil, reference: ReferenceID? = nil, CaptureTransform? = nil) /// Matches a noncapturing subpattern. - case nonCapturingGroup(_AST.GroupKind, Node) + case nonCapturingGroup(_AST.GroupKind) /// Marks all captures in a subpattern as ignored in strongly-typed output. 
- case ignoreCapturesInTypedOutput(Node) - case limitCaptureNesting(Node) + case ignoreCapturesInTypedOutput + case limitCaptureNesting // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -53,13 +53,11 @@ extension DSLTree { /// /// (?(cond) true-branch | false-branch) /// - case conditional( - _AST.ConditionKind, Node, Node) + case conditional(_AST.ConditionKind) case quantification( _AST.QuantificationAmount, - QuantificationKind, - Node) + QuantificationKind) case customCharacterClass(CustomCharacterClass) @@ -384,125 +382,6 @@ typealias _CharacterPredicateInterface = ( */ -extension DSLTree.Node { - /// Indicates whether this node has at least one child node (among other - /// associated values). - var hasChildNodes: Bool { - switch self { - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return false - - case .orderedChoice(let c), .concatenation(let c): - return !c.isEmpty - - case .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .limitCaptureNesting, - .conditional: - return true - - case .absentFunction(let abs): - return !abs.ast.children.isEmpty - } - } - - @_spi(RegexBuilder) - public var children: [DSLTree.Node] { - switch self { - - case let .orderedChoice(v): return v - case let .concatenation(v): return v - - case let .capture(_, _, n, _): return [n] - case let .nonCapturingGroup(_, n): return [n] - case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] - case let .limitCaptureNesting(n): return [n] - - case let .conditional(_, t, f): return [t,f] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode) - } - } - - public var coalescedChildren: [DSLTree.Node] { - // Before converting a concatenation in a tree to list form, we need 
to - // flatten out any nested concatenations, and coalesce any adjacent - // characters and scalars, forming quoted literals of their contents, - // over which we can perform grapheme breaking. - - func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { - switch node { - case .concatenation(let ch): - return ch.flatMap(flatten) - case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): - return flatten(n) - default: - return [node] - } - } - - switch self { - case let .orderedChoice(v): return v - case let .concatenation(v): - let children = v - .flatMap(flatten) - .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in - switch node { - case .atom(let a): - guard let c = a.literalCharacterValue else { return false } - str.append(c) - return true - case .quotedLiteral(let q): - str += q - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !str.isEmpty - default: - return false - } - } - return children - - case let .capture(_, _, n, _): return [n] - case let .nonCapturingGroup(_, n): return [n] - case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] - case let .limitCaptureNesting(n): return [n] - - case let .conditional(_, t, f): return [t,f] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode) - } - } -} - -extension DSLTree.Node { - var astNode: AST.Node? { - nil - } - - /// If this node is for a converted literal, look through it. - var lookingThroughConvertedLiteral: Self { - self - } -} - extension DSLTree.Atom { // Return the Character or promote a scalar to a Character var literalCharacterValue: Character? 
{ @@ -524,48 +403,6 @@ extension DSLTree.Node { } } -extension DSLTree { - struct Options { - // TBD - } -} - -extension DSLTree { - /// Indicates whether this DSLTree contains any capture groups. - var hasCapture: Bool { - root.hasCapture - } -} -extension DSLTree.Node { - /// Indicates whether this DSLTree node contains any capture groups. - var hasCapture: Bool { - switch self { - case .capture: - return true - default: - return self.children.any(\.hasCapture) - } - } -} - -extension DSLTree.Node { - func appending(_ newNode: DSLTree.Node) -> DSLTree.Node { - if case .concatenation(let components) = self { - return .concatenation(components + [newNode]) - } - return .concatenation([self, newNode]) - } - - func appendingAlternationCase( - _ newNode: DSLTree.Node - ) -> DSLTree.Node { - if case .orderedChoice(let components) = self { - return .orderedChoice(components + [newNode]) - } - return .orderedChoice([self, newNode]) - } -} - @_spi(RegexBuilder) public struct ReferenceID: Hashable { private static var counter: Int = 0 @@ -697,124 +534,41 @@ struct CaptureTransform: Hashable, CustomStringConvertible { } extension CaptureList.Builder { - mutating func addCaptures( - of node: DSLTree.Node, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool - ) { - switch node { - case let .orderedChoice(children): - for child in children { - addCaptures(of: child, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) - } - - case let .concatenation(children): - for child in children { - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - } - - case let .capture(name, _, child, transform): - captures.append(.init( - name: name, - type: transform?.resultType ?? 
child.wholeMatchType, - optionalDepth: nesting.depth, visibleInTypedOutput: visibleInTypedOutput, .fake)) - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .nonCapturingGroup(kind, child): - assert(!kind.ast.isCapturing) - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .ignoreCapturesInTypedOutput(child): - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) - - case let .limitCaptureNesting(child): - addCaptures(of: child, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .conditional(cond, trueBranch, falseBranch): - switch cond.ast { - case .group(let g): - addCaptures(of: .group(g), optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - default: - break - } - - addCaptures(of: trueBranch, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) - addCaptures(of: falseBranch, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) - - case let .quantification(amount, _, child): - var optNesting = nesting - if amount.ast.bounds.atLeast == 0 { - optNesting = optNesting.addingOptional - } - addCaptures(of: child, optionalNesting: optNesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .absentFunction(abs): - switch abs.ast.kind { - case .expression(_, _, let child): - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - case .clearer, .repeater, .stopper: - break - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - -// case let .convertedRegexLiteral(n, _): -// // We disable nesting for converted AST trees, as literals do not nest -// // captures. This includes literals nested in a DSL. 
-// return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) -// - case .matcher: - break - - case .customCharacterClass, .atom, .trivia, .empty, - .quotedLiteral, .consumer, .characterPredicate: - break - } - } - - static func build(_ dsl: DSLTree) -> CaptureList { - var builder = Self() - builder.captures.append( - .init(type: dsl.root.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) - builder.addCaptures(of: dsl.root, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) - return builder.captures - } - mutating func addCaptures( in list: inout ArraySlice, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool ) { guard let node = list.popFirst() else { return } switch node { - case let .orderedChoice(children): - for _ in 0.. CaptureList { var builder = Self() builder.captures.append( - .init(type: dsl.first.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) + .init(type: dsl.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) var nodes = dsl.nodes[...] builder.addCaptures(in: &nodes, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) return builder.captures } } -extension DSLTree.Node { - /// Returns true if the node is output-forwarding, i.e. not defining its own - /// output but forwarding its only child's output. - var isOutputForwarding: Bool { - switch self { - case .nonCapturingGroup, .ignoreCapturesInTypedOutput: - return true - case .orderedChoice, .concatenation, .capture, - .conditional, .quantification, .customCharacterClass, .atom, - .trivia, .empty, .quotedLiteral, .limitCaptureNesting, - .consumer, .absentFunction, - .characterPredicate, .matcher: - return false - } - } - - /// Returns the output-defining node, peering through any output-forwarding - /// nodes. 
- var outputDefiningNode: Self { - if isOutputForwarding { - assert(children.count == 1) - return children[0].outputDefiningNode - } - return self - } - - /// Returns the type of the whole match, i.e. `.0` element type of the output. - var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { - return type - } - return Substring.self - } -} - extension DSLList { - - /// Returns the output-defining node, peering through any output-forwarding - /// nodes. - var outputDefiningNode: DSLTree.Node? { - nodes.first(where: { !$0.isOutputForwarding }) - } - /// Returns the type of the whole match, i.e. `.0` element type of the output. var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { - return type - } - return Substring.self + nodes.wholeMatchType } } -extension DSLTree.Node { - /// Implementation for `canOnlyMatchAtStart`, which maintains the option - /// state. - /// - /// For a given specific node, this method can return one of three values: - /// - /// - `true`: This node is guaranteed to match only at the start of a subject. - /// - `false`: This node can match anywhere in the subject. - /// - `nil`: This node is inconclusive about where it can match. - /// - /// In particular, non-required groups and option-setting groups are - /// inconclusive about where they can match. - private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? { - switch self { - // Defining cases - case .atom(.assertion(.startOfSubject)): - return true - case .atom(.assertion(.caretAnchor)): - return !options.anchorsMatchNewlines - - // Changing options doesn't determine `true`/`false`. - case .atom(.changeMatchingOptions(let sequence)): - options.apply(sequence.ast) - return nil - - // Any other atom or consuming node returns `false`. - case .atom, .customCharacterClass, .quotedLiteral: - return false - - // Trivia/empty have no effect. 
- case .trivia, .empty: - return nil - - // In an alternation, all of its children must match only at start. - case .orderedChoice(let children): - return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true } - - // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - for child in children { - if let result = child._canOnlyMatchAtStartImpl(&options) { - return result - } - } - return false - - // Groups (and other parent nodes) defer to the child. - case .nonCapturingGroup(let kind, let child): - // Don't let a negative lookahead affect this - need to continue to next sibling - if kind.isNegativeLookahead { - return nil - } - options.beginScope() - defer { options.endScope() } - if case .changeMatchingOptions(let sequence) = kind.ast { - options.apply(sequence) - } - return child._canOnlyMatchAtStartImpl(&options) - case .capture(_, _, let child, _): - options.beginScope() - defer { options.endScope() } - return child._canOnlyMatchAtStartImpl(&options) - case .ignoreCapturesInTypedOutput(let child), .limitCaptureNesting(let child): - return child._canOnlyMatchAtStartImpl(&options) - - // A quantification that doesn't require its child to exist can still - // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, let child): - return amount.requiresAtLeastOne - ? child._canOnlyMatchAtStartImpl(&options) - : nil - - // For conditional nodes, both sides must require matching at start. - case .conditional(_, let child1, let child2): - return child1._canOnlyMatchAtStartImpl(&options) == true - && child2._canOnlyMatchAtStartImpl(&options) == true - - // Extended behavior isn't known, so we return `false` for safety. - case .consumer, .matcher, .characterPredicate, .absentFunction: - return false - } - } - - /// Returns a Boolean value indicating whether the regex with this node as - /// the root can _only_ match at the start of a subject. 
- /// - /// For example, these regexes can only match at the start of a subject: - /// - /// - `/^foo/` - /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`) - /// - /// These can match other places in a subject: - /// - /// - `/(^foo)?bar/` (`^` is in an optional group) - /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`) - /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`) - internal func canOnlyMatchAtStart() -> Bool { - var options = MatchingOptions() - return _canOnlyMatchAtStartImpl(&options) ?? false - } -} - -// MARK: Required first and last atoms - -extension DSLTree.Node { - private func _requiredAtomImpl(forward: Bool) -> DSLTree.Atom?? { - switch self { - case .atom(let atom): - return switch atom { - case .changeMatchingOptions: - nil +extension Sequence { + var wholeMatchType: Any.Type { + Loop: + for node in self { + switch node { + case .nonCapturingGroup, .ignoreCapturesInTypedOutput: + continue Loop + case .matcher(let type, _): + return type default: - atom + break Loop } - - // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - if forward { - for child in children { - if let result = child._requiredAtomImpl(forward: forward) { - return result - } - } - } else { - for child in children.reversed() { - if let result = child._requiredAtomImpl(forward: forward) { - return result - } - } - } - return nil - - // For a quoted literal, we can look at the first char - // TODO: matching semantics??? - case .quotedLiteral(let str): - return str.first.map(DSLTree.Atom.char) - - // TODO: custom character classes could/should participate here somehow - case .customCharacterClass: - return .some(nil) - - // Trivia/empty have no effect. - case .trivia, .empty: - return nil - - // For alternation and conditional, no required first (this could change - // if we identify the _same_ required first atom across all possibilities). 
- case .orderedChoice, .conditional: - return .some(nil) - - // Groups (and other parent nodes) defer to the child. - case .nonCapturingGroup(_, let child), .capture(_, _, let child, _), - .ignoreCapturesInTypedOutput(let child), - .limitCaptureNesting(let child): - return child._requiredAtomImpl(forward: forward) - - // A quantification that doesn't require its child to exist can still - // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, let child): - return amount.requiresAtLeastOne - ? child._requiredAtomImpl(forward: forward) - : .some(nil) - - // Extended behavior isn't known, so we return `false` for safety. - case .consumer, .matcher, .characterPredicate, .absentFunction: - return .some(nil) } - } - - internal func requiredFirstAtom() -> DSLTree.Atom? { - self._requiredAtomImpl(forward: true) ?? nil - } - - internal func requiredLastAtom() -> DSLTree.Atom? { - self._requiredAtomImpl(forward: false) ?? nil + return Substring.self } } +// MARK: Required first and last atoms private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTree.Atom?? { guard let node = list.popFirst() else { @@ -1111,8 +655,8 @@ private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTre } // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - for _ in 0..) -> DSLTre // A quantification that doesn't require its child to exist can still // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, _): + case .quantification(let amount, _): return amount.requiresAtLeastOne ? _requiredAtomImpl(&list) : .some(nil) @@ -1166,44 +710,6 @@ internal func requiredFirstAtom(_ list: inout ArraySlice) -> DSLTr // include symbols from implementation-only dependencies. extension DSLTree { - var captureList: CaptureList { .Builder.build(self) } - - /// Presents a wrapped version of `DSLTree.Node` that can provide an internal - /// `_TreeNode` conformance. 
- struct _Tree: _TreeNode { - var node: DSLTree.Node - - init(_ node: DSLTree.Node) { - self.node = node - } - - var children: [_Tree]? { - switch node { - - case let .orderedChoice(v): return v.map(_Tree.init) - case let .concatenation(v): return v.map(_Tree.init) - - case let .capture(_, _, n, _): return [_Tree(n)] - case let .nonCapturingGroup(_, n): return [_Tree(n)] - case let .quantification(_, _, n): return [_Tree(n)] - case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] - case let .limitCaptureNesting(n): - // This is a transparent wrapper - return _Tree(n).children - - case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode).map(_Tree.init) - } - } - } - @_spi(RegexBuilder) public enum _AST { @_spi(RegexBuilder) @@ -1342,8 +848,7 @@ extension DSLTree.Node { @available(SwiftStdlib 5.7, *) static func repeating( _ range: Range, - _ behavior: RegexRepetitionBehavior?, - _ node: DSLTree.Node + _ behavior: RegexRepetitionBehavior? ) -> DSLTree.Node { // TODO: Throw these as errors precondition(range.lowerBound >= 0, "Cannot specify a negative lower bound") @@ -1361,23 +866,23 @@ extension DSLTree.Node { if range.upperBound == Int.max { switch lower { case 0: // 0... - return .quantification(.zeroOrMore, kind, node) + return .quantification(.zeroOrMore, kind) case 1: // 1... - return .quantification(.oneOrMore, kind, node) + return .quantification(.oneOrMore, kind) default: // n... - return .quantification(.nOrMore(lower), kind, node) + return .quantification(.nOrMore(lower), kind) } } if range.count == 1 { // ..<1 or ...0 or any range with count == 1 // Note: `behavior` is ignored in this case - return .quantification(.exactly(lower), .default, node) + return .quantification(.exactly(lower), .default) } switch lower { case 0: // 0.. 
Regex { // Don't wrap `child` again if it's a leaf node. child.regex.list.hasChildren - ? child.regex.prepending(.ignoreCapturesInTypedOutput(TEMP_FAKE_NODE)) as Regex + ? child.regex.prepending(.ignoreCapturesInTypedOutput) as Regex : .init(list: child.regex.program.list) } @@ -107,7 +107,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.zeroOrOne, kind, TEMP_FAKE_NODE)) + return component.regex.prepending(.quantification(.zeroOrOne, kind)) } @available(SwiftStdlib 5.7, *) @@ -116,7 +116,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.zeroOrMore, kind, TEMP_FAKE_NODE)) + return component.regex.prepending(.quantification(.zeroOrMore, kind)) } @available(SwiftStdlib 5.7, *) @@ -125,7 +125,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default - return component.regex.prepending(.quantification(.oneOrMore, kind, TEMP_FAKE_NODE)) + return component.regex.prepending(.quantification(.oneOrMore, kind)) } @available(SwiftStdlib 5.7, *) @@ -133,7 +133,7 @@ public struct _RegexFactory { _ count: Int, _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.quantification(.exactly(count), .default, TEMP_FAKE_NODE)) + component.regex.prepending(.quantification(.exactly(count), .default)) } @available(SwiftStdlib 5.7, *) @@ -142,14 +142,14 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior?, _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.repeating(range, behavior, TEMP_FAKE_NODE)) + component.regex.prepending(.repeating(range, behavior)) } @available(SwiftStdlib 5.7, *) public func atomicNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.atomicNonCapturing, TEMP_FAKE_NODE)) + component.regex.prepending(.nonCapturingGroup(.atomicNonCapturing)) } @_spi(RegexBuilder) @@ -157,7 +157,7 @@ public struct _RegexFactory { public func lookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.lookahead, TEMP_FAKE_NODE)) + component.regex.prepending(.nonCapturingGroup(.lookahead)) } @_spi(RegexBuilder) @@ -165,21 +165,21 @@ public struct _RegexFactory { public func negativeLookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.negativeLookahead, TEMP_FAKE_NODE)) + component.regex.prepending(.nonCapturingGroup(.negativeLookahead)) } @available(SwiftStdlib 5.7, *) public func orderedChoice( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.orderedChoice([TEMP_FAKE_NODE])) + component.regex.prepending(.orderedChoice(1)) } @available(SwiftStdlib 5.7, *) public func capture( _ component: some RegexComponent ) -> Regex { - 
component.regex.prepending(.capture(TEMP_FAKE_NODE)) + component.regex.prepending(.capture()) } @available(SwiftStdlib 5.7, *) @@ -187,7 +187,7 @@ public struct _RegexFactory { _ component: some RegexComponent, _ reference: Int ) -> Regex { - component.regex.prepending(.capture(reference: ReferenceID(reference), TEMP_FAKE_NODE)) + component.regex.prepending(.capture(reference: ReferenceID(reference))) } @available(SwiftStdlib 5.7, *) @@ -199,7 +199,6 @@ public struct _RegexFactory { component.regex.prepending( .capture( reference: reference.map { ReferenceID($0) }, - TEMP_FAKE_NODE, CaptureTransform(transform) )) } @@ -213,7 +212,6 @@ public struct _RegexFactory { component.regex.prepending( .capture( reference: reference.map { ReferenceID($0) }, - TEMP_FAKE_NODE, CaptureTransform(transform) )) } diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 34cc20ad7..bbf41d3b3 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -129,7 +129,7 @@ extension StringCapture { // TODO: Move `flatCaptureTest`s over here too... func compile(_ ast: AST) -> MEProgram { - try! Compiler(tree: ast.dslTree).emit() + try! 
Compiler(ast: ast).emit() } func captureTest( diff --git a/Tests/RegexTests/OptimizationTests.swift b/Tests/RegexTests/OptimizationTests.swift index a60d9bf5f..d40c8c8ac 100644 --- a/Tests/RegexTests/OptimizationTests.swift +++ b/Tests/RegexTests/OptimizationTests.swift @@ -37,7 +37,7 @@ import Testing list.autoPossessify() for node in list.nodes { switch node { - case .quantification(_, let kind, _): + case .quantification(_, let kind): #expect( kind.isExplicit && kind.quantificationKind?.ast == .possessive, "Expected possessification in '\(pattern._literalPattern!)'") @@ -57,7 +57,7 @@ import Testing list.autoPossessify() for node in list.nodes { switch node { - case .quantification(_, let kind, _): + case .quantification(_, let kind): #expect( kind.quantificationKind?.ast != .possessive, "Unexpected possessification in '\(pattern._literalPattern!)'") diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 19ab4c35c..30838ace8 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -236,13 +236,13 @@ extension RenderDSLTests { try testConversion(#"a\u{301}"#, #""" Regex { - "a\u{301}" + "á" } """#) try testConversion(#"(?x) a \u{301}"#, #""" Regex { - "a\u{301}" + "á" } """#) @@ -254,16 +254,16 @@ extension RenderDSLTests { try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" Regex { - "👨\u{200D}👨\u{200D}👧\u{200D}👦" + "👨‍👨‍👧‍👦" } """#) try testConversion(#"(👨\u{200D}👨)\u{200D}👧\u{200D}👦"#, #""" Regex { Capture { - "👨\u{200D}👨" + "👨‍👨" } - "\u{200D}👧\u{200D}👦" + "‍👧‍👦" } """#) @@ -272,7 +272,7 @@ extension RenderDSLTests { Regex { "abcd" Regex { - "e\u{301}" + "é" One(.digit) } } @@ -284,12 +284,9 @@ extension RenderDSLTests { } """#) - // TODO: We might want to consider preserving scalar sequences in the DSL, - // and allowing them to merge with other concatenations.
try testConversion(#"\u{A B C}\u{d}efg"#, #""" Regex { - "\u{A}\u{B}\u{C}" - "\u{D}efg" + "\u{A}\u{B}\u{C}\u{D}efg" } """#) @@ -303,6 +300,60 @@ extension RenderDSLTests { """#) } + func testQuantifiers() throws { + try testConversion(#"a+b*c?d{1,}e{,3}f{2,4}g{5}"#, #""" + Regex { + OneOrMore { + "a" + } + ZeroOrMore { + "b" + } + Optionally { + "c" + } + Repeat(1...) { + "d" + } + Repeat(...3) { + "e" + } + Repeat(2...4) { + "f" + } + Repeat(count: 5) { + "g" + } + } + """#) + + try testConversion(#"(?:(?:(?:(?:(?:(?:a+b)*c)?d){1,}e){,3}f){2,4}g){5}"#, #""" + Regex { + Repeat(count: 5) { + Repeat(2...4) { + Repeat(...3) { + Repeat(1...) { + Optionally { + ZeroOrMore { + OneOrMore { + "a" + } + "b" + } + "c" + } + "d" + } + "e" + } + "f" + } + "g" + } + } + """#) + } + func testCharacterClass() throws { try testConversion(#"[abc]+"#, #""" Regex { @@ -343,6 +394,30 @@ extension RenderDSLTests { ZeroOrMore(CharacterClass.anyOf("i").inverted) } """#) + + try testConversion(#"[a-z]+"#, #""" + Regex { + OneOrMore(("a"..."z")) + } + """#) + + try testConversion(#"[[a-z]&&[0-9]]+"#, #""" + Regex { + OneOrMore { + One(("a"..."z") + .intersection(("0"..."9"))) + } + } + """#) + + // Non-convertible elements in character class + try testConversion(#"[a-z\N{BEE}]+"#, #""" + Regex { + OneOrMore { + #/[a-z\N{BEE}]/# + } + } + """#) } func testChangeMatchingOptions() throws { From 97a5e75bfbbc97209f768f6ea46f350c00c52a81 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 9 Apr 2026 23:54:42 -0500 Subject: [PATCH 2/4] Reduce amount of \Q...\E quoting in _literalPattern This change switches to manually escape regex metacharacters, which lessens the circumstances that use the more cumbersome regex quoting syntax. 
--- Sources/_StringProcessing/LiteralPrinter.swift | 10 +++++++--- Tests/RegexTests/LiteralPrinterTests.swift | 6 ++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index e841e4b5d..f13bb61cb 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -389,7 +389,7 @@ extension LiteralPrinter { } func prepareQuotedLiteral(_ literal: String) -> String { - if options.usesExtendedWhitespace || literal.containsRegexMetaCharacters { + if options.usesExtendedWhitespace { return #"\Q\#(literal)\E"# } else { return literal.escapingConfusableCharacters() @@ -474,8 +474,12 @@ extension String { func escapingConfusableCharacters() -> String { reduce(into: "") { result, ch in for scalar in ch.unicodeScalars { - if scalar.isPrintableASCII { - result.append(Character(scalar)) + let ch = Character(scalar) + if ch.isRegexMetaCharacter { + result.append("\\") + result.append(ch) + } else if scalar.isPrintableASCII { + result.append(ch) } else { result.append(scalar.escapedString) } diff --git a/Tests/RegexTests/LiteralPrinterTests.swift b/Tests/RegexTests/LiteralPrinterTests.swift index 69f273fd5..2b109b321 100644 --- a/Tests/RegexTests/LiteralPrinterTests.swift +++ b/Tests/RegexTests/LiteralPrinterTests.swift @@ -40,6 +40,12 @@ extension RegexTests { XCTAssertEqual("\(printableRegex)", regexString) } + func testLiteral169973074() throws { + let regexString = #"^id\-\d+"# + let regex = try Regex(regexString) + _literalTest(regex, expected: regexString) + } + func testUnicodeEscapes() throws { let regex0 = #/[a]\u0301/# _literalTest(regex0, expected: #"[a]\u0301"#) From 5f5fc9ddd880a8165bce92c6d431a9bae4b63ffa Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 15 Apr 2026 17:47:39 -0500 Subject: [PATCH 3/4] Revert test fixtures --- Tests/RegexTests/RenderDSLTests.swift | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 30838ace8..b6b249242 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -236,13 +236,13 @@ extension RenderDSLTests { try testConversion(#"a\u{301}"#, #""" Regex { - "á" + "a\u{301}" } """#) try testConversion(#"(?x) a \u{301}"#, #""" Regex { - "á" + "a\u{301}" } """#) @@ -254,16 +254,16 @@ extension RenderDSLTests { try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" Regex { - "👨‍👨‍👧‍👦" + "👨\u{200D}👨\u{200D}👧\u{200D}👦" } """#) try testConversion(#"(👨\u{200D}👨)\u{200D}👧\u{200D}👦"#, #""" Regex { Capture { - "👨‍👨" + "👨\u{200D}👨" } - "‍👧‍👦" + "\u{200D}👧\u{200D}👦" } """#) @@ -272,7 +272,7 @@ extension RenderDSLTests { Regex { "abcd" Regex { - "é" + "e\u{301}" One(.digit) } } From a99a5e543d2c2c6b6f3a7ffc4efc9c5d62d92fde Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 15 Apr 2026 17:49:33 -0500 Subject: [PATCH 4/4] Remove commented-out code --- Sources/_StringProcessing/PrintAsPattern.swift | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index f29b06419..b2cc467a3 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -191,18 +191,6 @@ extension PrettyPrinter { case .empty: print("") -// case let .quotedLiteral(v): -// let str = v._quoted.reduce(into: "") { result, ch in -// for scalar in ch.unicodeScalars { -// switch scalar.properties.generalCategory { -// case .control: -// result.append(#"\u{\#(String(scalar.value, radix: 16, uppercase: true))}"#) -// default: -// result.append(Character(scalar)) -// } -// } -// } -// print(str) case let .quotedLiteral(v, display: d): if let display = d { print(display._bareQuoted)