diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 83c014d2a..febdac7d1 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -320,7 +320,8 @@ extension RegexValidator { func validateGroup(_ group: AST.Group) throws { let kind = group.kind switch kind.value { - case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead: + case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead, + .atomicNonCapturing: break case .balancedCapture: @@ -331,9 +332,6 @@ extension RegexValidator { // We need to figure out how these interact with typed captures. throw error(.unsupported("branch reset group"), at: kind.location) - case .atomicNonCapturing: - throw error(.unsupported("atomic group"), at: kind.location) - case .nonAtomicLookahead: throw error(.unsupported("non-atomic lookahead"), at: kind.location) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3a91b6c67..a92fa8837 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -295,7 +295,7 @@ extension Compiler.ByteCodeGen { save(restoringAt: success) save(restoringAt: intercept) <sub-pattern> // failure restores at intercept - clearSavePoint // remove intercept + clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern> <if negative>: clearSavePoint // remove success fail // positive->success, negative propagates @@ -313,7 +313,7 @@ extension Compiler.ByteCodeGen { builder.buildSave(success) builder.buildSave(intercept) try emitNode(child) - builder.buildClear() + builder.buildClearThrough(intercept) if !positive { builder.buildClear() } @@ -328,6 +328,38 @@ extension Compiler.ByteCodeGen { builder.label(success) } + mutating func emitAtomicNoncapturingGroup( + _ child: DSLTree.Node + ) throws { + /* + save(continuingAt: success) + save(restoringAt: intercept) + <sub-pattern> // failure restores at 
intercept + clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern> + fail // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ + + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSaveAddress(success) + builder.buildSave(intercept) + try emitNode(child) + builder.buildClearThrough(intercept) + builder.buildFail() + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + mutating func emitMatcher( _ matcher: @escaping _MatcherInterface, into capture: CaptureRegister? = nil @@ -393,6 +425,9 @@ extension Compiler.ByteCodeGen { } options.apply(optionSequence) try emitNode(child) + + case .atomicNonCapturing: + try emitAtomicNoncapturingGroup(child) default: // FIXME: Other kinds... diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index ff28ee9e2..9144c031f 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -228,6 +228,13 @@ extension Instruction { /// Precondition: There is a save point to remove case clear + /// Remove save points up to and including the operand + /// + /// Operand: instruction address to look for + /// + /// Precondition: The operand is in the save point list + case clearThrough + /// View the most recently saved point /// /// UNIMPLEMENTED diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index f706c0471..e1d68e0ad 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -158,6 +158,10 @@ extension MEProgram.Builder { mutating func buildClear() { instructions.append(.init(.clear)) } + mutating func buildClearThrough(_ t: AddressToken) { + instructions.append(.init(.clearThrough)) + fixup(to: t) + } mutating func buildRestore() 
{ instructions.append(.init(.restore)) } @@ -322,7 +326,7 @@ extension MEProgram.Builder { case .condBranchZeroElseDecrement: payload = .init(addr: addr, int: inst.payload.int) - case .branch, .save, .saveAddress, .call: + case .branch, .save, .saveAddress, .call, .clearThrough: payload = .init(addr: addr) case .splitSaving: diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 8f777ad33..13d63ee0d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -204,6 +204,17 @@ extension Processor { } } + mutating func clearThrough(_ address: InstructionAddress) { + while let sp = savePoints.popLast() { + if sp.pc == address { + controller.step() + return + } + } + // TODO: What should we do here? + fatalError("Invalid code: Tried to clear save points when empty") + } + mutating func cycle() { _checkInvariants() assert(state == .inProgress) @@ -288,9 +299,13 @@ extension Processor { if let _ = savePoints.popLast() { controller.step() } else { - fatalError("TODO: What should we do here?") + // TODO: What should we do here? 
+ fatalError("Invalid code: Tried to clear save points when empty") } + case .clearThrough: + clearThrough(payload.addr) + case .peek: fatalError() diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 5a88adf6b..d7a1ce6ee 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -467,6 +467,29 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual("ab12".firstMatch(of: octoDecimalRegex)!.output.1, 61904) } + func testLocal() throws { + try _testDSLCaptures( + ("aaaaa", nil), + matchType: Substring.self, ==) + { + Local { + OneOrMore("a") + } + "a" + } + + try _testDSLCaptures( + ("aa", "aa"), + ("aaa", nil), + matchType: Substring.self, ==) + { + Local { + OneOrMore("a", .reluctant) + } + "a" + } + } + func testAssertions() throws { try _testDSLCaptures( ("aaaaab", "aaaaab"), diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 95b865ed9..5b08093e6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -889,8 +889,7 @@ extension RegexTests { input: "Price: 100 dollars", match: nil) firstMatchTest( #"(?=\d+ dollars)\d+"#, - input: "Price: 100 dollars", match: "100", - xfail: true) // TODO + input: "Price: 100 dollars", match: "100") firstMatchTest( #"\d+(*pla: dollars)"#, @@ -915,6 +914,14 @@ extension RegexTests { #"\d+(*negative_lookahead: dollars)"#, input: "Price: 100 pesos", match: "100") + // More complex lookaheads + firstMatchTests( + #"(?=.*e)(?=.*o)(?!.*z)."#, + (input: "hello", match: "h"), + (input: "hzello", match: "e"), + (input: "hezllo", match: nil), + (input: "helloz", match: nil)) + firstMatchTest( #"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) firstMatchTest( @@ -1046,14 +1053,93 @@ extension RegexTests { firstMatchTest( #"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") firstMatchTest( - #"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true) + 
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac") firstMatchTest( - "(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true) + "(*atomic:a|.b)c", input: "123abcacxyz", match: "ac") firstMatchTest( #"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac") firstMatchTest( - #"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true) + #"(?>a+)[a-z]c"#, input: "123aacacxyz", match: nil) + + // Atomicity should stay in the atomic group + firstMatchTest( + #"(?:(?>a)|.b)c"#, input: "123abcacxyz", match: "abc") + + // Quantifier behavior inside atomic groups + + // (?:a+?) matches as few 'a's as possible, after matching the first + // (?>a+?) always matches exactly one 'a' + firstMatchTests( + #"^(?:a+?)a$"#, + (input: "a", match: nil), + (input: "aa", match: "aa"), + (input: "aaa", match: "aaa")) + firstMatchTests( + #"^(?>a+?)a$"#, + (input: "a", match: nil), + (input: "aa", match: "aa"), + (input: "aaa", match: nil)) + + // (?:a?+) and (?>a?+) are equivalent: they match one 'a' if available + firstMatchTests( + #"^(?:a?+)a$"#, + (input: "a", match: nil), + xfail: true) + firstMatchTests( + #"^(?:a?+)a$"#, + (input: "aa", match: "aa"), + (input: "aaa", match: nil)) + firstMatchTests( + #"^(?>a?+)a$"#, + (input: "a", match: nil), + (input: "aa", match: "aa"), + (input: "aaa", match: nil)) + // Capture behavior in non-atomic vs atomic groups + firstMatchTests( + #"(\d+)\w+\1"#, + (input: "123x12", match: "123x12"), // `\w+` matches "3x" in this case + (input: "23x23", match: "23x23"), + (input: "123x23", match: "23x23")) + firstMatchTests( + #"(?>(\d+))\w+\1"#, + (input: "123x12", match: nil)) + firstMatchTests( + #"(?>(\d+))\w+\1"#, + (input: "23x23", match: "23x23"), + (input: "123x23", match: "23x23"), + xfail: true) + + // Backreferences in lookaheads + firstMatchTests( + #"^(?=.*(.)(.)\2\1).+$"#, + (input: "abbba", match: nil), + (input: "ABBA", match: "ABBA"), + (input: "defABBAdef", match: "defABBAdef")) + firstMatchTests( + #"^(?=.*(.)(.)\2\1).+\2$"#, 
+ (input: "abbba", match: nil), + (input: "ABBA", match: nil), + (input: "defABBAdef", match: nil)) + // FIXME: Backreferences don't escape positive lookaheads + firstMatchTests( + #"^(?=.*(.)(.)\2\1).+\2$"#, + (input: "ABBAB", match: "ABBAB"), + (input: "defABBAdefB", match: "defABBAdefB"), + xfail: true) + + firstMatchTests( + #"^(?!.*(.)(.)\2\1).+$"#, + (input: "abbba", match: "abbba"), + (input: "ABBA", match: nil), + (input: "defABBAdef", match: nil)) + // Backreferences don't escape negative lookaheads; + // matching only proceeds when the lookahead fails + firstMatchTests( + #"^(?!.*(.)(.)\2\1).+\2$"#, + (input: "abbba", match: nil), + (input: "abbbab", match: nil), + (input: "ABBAB", match: nil)) // TODO: Test example where non-atomic is significant firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 20067ac20..f581a8beb 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -940,10 +940,10 @@ extension RegexTests { concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) parseTest( #"a(?>b)c"#, - concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) + concat("a", atomicNonCapturing("b"), "c")) parseTest( "a(*atomic:b)c", - concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) + concat("a", atomicNonCapturing("b"), "c")) parseTest("a(?=b)c", concat("a", lookahead("b"), "c")) parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c"))