From bd3ad7daed854a47d6c800747a73611780f230fa Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 9 Jun 2022 16:43:50 -0500 Subject: [PATCH 1/7] Add a `clearThrough` instruction This will let us fix lookahead assertions that have leftover save points in the subpattern on success, and also allow us to implement atomic groups. --- .../_StringProcessing/Engine/Instruction.swift | 7 +++++++ Sources/_StringProcessing/Engine/MEBuilder.swift | 6 +++++- Sources/_StringProcessing/Engine/Processor.swift | 15 ++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index ff28ee9e2..9144c031f 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -228,6 +228,13 @@ extension Instruction { /// Precondition: There is a save point to remove case clear + /// Remove save points up to and including the operand + /// + /// Operand: instruction address to look for + /// + /// Precondition: The operand is in the save point list + case clearThrough + /// View the most recently saved point /// /// UNIMPLEMENTED diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index f706c0471..e1d68e0ad 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -158,6 +158,10 @@ extension MEProgram.Builder { mutating func buildClear() { instructions.append(.init(.clear)) } + mutating func buildClearThrough(_ t: AddressToken) { + instructions.append(.init(.clearThrough)) + fixup(to: t) + } mutating func buildRestore() { instructions.append(.init(.restore)) } @@ -322,7 +326,7 @@ extension MEProgram.Builder { case .condBranchZeroElseDecrement: payload = .init(addr: addr, int: inst.payload.int) - case .branch, .save, .saveAddress, .call: + case .branch, .save, .saveAddress, .call, .clearThrough: 
payload = .init(addr: addr) case .splitSaving: diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 8f777ad33..8da8d362b 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -215,6 +215,7 @@ extension Processor { } let (opcode, payload) = fetch().destructure + OpCodeSwitch: switch opcode { case .invalid: fatalError("Invalid program") @@ -288,9 +289,21 @@ extension Processor { if let _ = savePoints.popLast() { controller.step() } else { - fatalError("TODO: What should we do here?") + // TODO: What should we do here? + fatalError("Invalid code: Tried to clear save points when empty") } + case .clearThrough: + let addr = payload.addr + while let sp = savePoints.popLast() { + if sp.pc == addr { + controller.step() + break OpCodeSwitch + } + } + // TODO: What should we do here? + fatalError("Invalid code: Tried to clear save points when empty") + case .peek: fatalError() From e08bb64d977f285eac3a9cf34dc89d31fd2c0669 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 9 Jun 2022 16:49:06 -0500 Subject: [PATCH 2/7] Fix lookaheads with quantifiers On success, the subpatterns in lookaheads like (?=.*e) had a save point that persisted, causing the logic in the lookahead group to be invalid. 
--- Sources/_StringProcessing/ByteCodeGen.swift | 4 ++-- Tests/RegexTests/MatchTests.swift | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3a91b6c67..039113110 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -295,7 +295,7 @@ extension Compiler.ByteCodeGen { save(restoringAt: success) save(restoringAt: intercept) <sub-pattern> // failure restores at intercept - clearSavePoint // remove intercept + clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern> <if negative>: clearSavePoint // remove success fail // positive->success, negative propagates @@ -313,7 +313,7 @@ extension Compiler.ByteCodeGen { builder.buildSave(success) builder.buildSave(intercept) try emitNode(child) - builder.buildClear() + builder.buildClearThrough(intercept) if !positive { builder.buildClear() } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 95b865ed9..079d3e589 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -889,8 +889,7 @@ extension RegexTests { input: "Price: 100 dollars", match: nil) firstMatchTest( #"(?=\d+ dollars)\d+"#, - input: "Price: 100 dollars", match: "100", - xfail: true) // TODO + input: "Price: 100 dollars", match: "100") firstMatchTest( #"\d+(*pla: dollars)"#, @@ -915,6 +914,14 @@ extension RegexTests { #"\d+(*negative_lookahead: dollars)"#, input: "Price: 100 pesos", match: "100") + // More complex lookaheads + firstMatchTest( + #"(?=.*e)(?=.*o)(?!.*z)"#, + input: "hello", match: "") + firstMatchTest( + #"^(?=.*e)(?=.*o)(?!.*h)"#, + input: "hello", match: nil) + firstMatchTest( #"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) firstMatchTest( From f3b993e56d9c68e20c5d7ff38b63ee5a280fbd57 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 9 Jun 2022 17:44:46 -0500 Subject: [PATCH 3/7] Implement atomic 
non-capturing group support In addition to the (?>...) syntax, this is what's underneath `Local`. --- Sources/_RegexParser/Regex/Parse/Sema.swift | 6 ++-- Sources/_StringProcessing/ByteCodeGen.swift | 35 +++++++++++++++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 23 ++++++++++++++ Tests/RegexTests/MatchTests.swift | 17 ++++++++-- Tests/RegexTests/ParseTests.swift | 4 +-- 5 files changed, 76 insertions(+), 9 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 83c014d2a..febdac7d1 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -320,7 +320,8 @@ extension RegexValidator { func validateGroup(_ group: AST.Group) throws { let kind = group.kind switch kind.value { - case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead: + case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead, + .atomicNonCapturing: break case .balancedCapture: @@ -331,9 +332,6 @@ extension RegexValidator { // We need to figure out how these interact with typed captures. 
throw error(.unsupported("branch reset group"), at: kind.location) - case .atomicNonCapturing: - throw error(.unsupported("atomic group"), at: kind.location) - case .nonAtomicLookahead: throw error(.unsupported("non-atomic lookahead"), at: kind.location) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 039113110..a92fa8837 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -328,6 +328,38 @@ extension Compiler.ByteCodeGen { builder.label(success) } + mutating func emitAtomicNoncapturingGroup( + _ child: DSLTree.Node + ) throws { + /* + save(continuingAt: success) + save(restoringAt: intercept) + <sub-pattern> // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern> + fail // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ + + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSaveAddress(success) + builder.buildSave(intercept) + try emitNode(child) + builder.buildClearThrough(intercept) + builder.buildFail() + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + mutating func emitMatcher( _ matcher: @escaping _MatcherInterface, into capture: CaptureRegister? = nil @@ -393,6 +425,9 @@ extension Compiler.ByteCodeGen { } options.apply(optionSequence) try emitNode(child) + + case .atomicNonCapturing: + try emitAtomicNoncapturingGroup(child) default: // FIXME: Other kinds... 
diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 5a88adf6b..d7a1ce6ee 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -467,6 +467,29 @@ class RegexDSLTests: XCTestCase { XCTAssertEqual("ab12".firstMatch(of: octoDecimalRegex)!.output.1, 61904) } + func testLocal() throws { + try _testDSLCaptures( + ("aaaaa", nil), + matchType: Substring.self, ==) + { + Local { + OneOrMore("a") + } + "a" + } + + try _testDSLCaptures( + ("aa", "aa"), + ("aaa", nil), + matchType: Substring.self, ==) + { + Local { + OneOrMore("a", .reluctant) + } + "a" + } + } + func testAssertions() throws { try _testDSLCaptures( ("aaaaab", "aaaaab"), diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 079d3e589..d7b48bf3c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1053,14 +1053,25 @@ extension RegexTests { firstMatchTest( #"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") firstMatchTest( - #"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true) + #"(?>a|.b)c"#, input: "123abcacxyz", match: "ac") firstMatchTest( - "(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true) + "(*atomic:a|.b)c", input: "123abcacxyz", match: "ac") firstMatchTest( #"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac") firstMatchTest( - #"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true) + #"(?>a+)[a-z]c"#, input: "123aacacxyz", match: nil) + + // Atomicity should stay in the atomic group + firstMatchTest( + #"(?:(?>a)|.b)c"#, input: "123abcacxyz", match: "abc") + // Quantifier behavior inside atomic + firstMatchTest( + #"^(?>a+?)a$"#, input: "aa", match: "aa") + firstMatchTest( + #"^(?>a+?)a$"#, input: "aaa", match: nil) + firstMatchTest( + #"(?>a++)a"#, input: "aaa", match: nil) // TODO: Test example where non-atomic is significant firstMatchTest( diff --git 
a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 20067ac20..f581a8beb 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -940,10 +940,10 @@ extension RegexTests { concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) parseTest( #"a(?>b)c"#, - concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) + concat("a", atomicNonCapturing("b"), "c")) parseTest( "a(*atomic:b)c", - concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) + concat("a", atomicNonCapturing("b"), "c")) parseTest("a(?=b)c", concat("a", lookahead("b"), "c")) parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c")) From 49a8ddae8f48bf73d5341a6d0572dc789fb4154c Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 13 Jun 2022 09:47:02 -0500 Subject: [PATCH 4/7] Add test cases for captures in lookaheads --- Tests/RegexTests/MatchTests.swift | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index d7b48bf3c..c8c486bdf 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1073,6 +1073,12 @@ extension RegexTests { firstMatchTest( #"(?>a++)a"#, input: "aaa", match: nil) + firstMatchTest( + #"(?>(\d+))\w+\1"#, input: "123x12", match: nil) + firstMatchTest( + #"(?>(\d+))\w+\1"#, input: "123x23", match: "23x23", + xfail: true) + // TODO: Test example where non-atomic is significant firstMatchTest( #"\d+(?* dollars)"#, From 0db7dc6dfbc82cd715646118009676260be2f156 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 16 Jun 2022 12:35:11 -0500 Subject: [PATCH 5/7] Address feedback, expand tests --- .../_StringProcessing/Engine/Processor.swift | 22 +++---- Tests/RegexTests/MatchTests.swift | 59 +++++++++++++------ 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 
8da8d362b..13d63ee0d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -204,6 +204,17 @@ extension Processor { } } + mutating func clearThrough(_ address: InstructionAddress) { + while let sp = savePoints.popLast() { + if sp.pc == address { + controller.step() + return + } + } + // TODO: What should we do here? + fatalError("Invalid code: Tried to clear save points when empty") + } + mutating func cycle() { _checkInvariants() assert(state == .inProgress) @@ -215,7 +226,6 @@ extension Processor { } let (opcode, payload) = fetch().destructure - OpCodeSwitch: switch opcode { case .invalid: fatalError("Invalid program") @@ -294,15 +304,7 @@ extension Processor { } case .clearThrough: - let addr = payload.addr - while let sp = savePoints.popLast() { - if sp.pc == addr { - controller.step() - break OpCodeSwitch - } - } - // TODO: What should we do here? - fatalError("Invalid code: Tried to clear save points when empty") + clearThrough(payload.addr) case .peek: fatalError() diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c8c486bdf..319eaf976 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -915,12 +915,12 @@ extension RegexTests { input: "Price: 100 pesos", match: "100") // More complex lookaheads - firstMatchTest( - #"(?=.*e)(?=.*o)(?!.*z)"#, - input: "hello", match: "") - firstMatchTest( - #"^(?=.*e)(?=.*o)(?!.*h)"#, - input: "hello", match: nil) + firstMatchTests( + #"(?=.*e)(?=.*o)(?!.*z)."#, + (input: "hello", match: "h"), + (input: "hzello", match: "e"), + (input: "hezllo", match: nil), + (input: "helloz", match: nil)) firstMatchTest( #"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) @@ -1065,18 +1065,43 @@ extension RegexTests { firstMatchTest( #"(?:(?>a)|.b)c"#, input: "123abcacxyz", match: "abc") - // Quantifier behavior inside atomic - firstMatchTest( - #"^(?>a+?)a$"#, input: "aa", match: "aa") 
- firstMatchTest( - #"^(?>a+?)a$"#, input: "aaa", match: nil) - firstMatchTest( - #"(?>a++)a"#, input: "aaa", match: nil) + // Quantifier behavior inside atomic groups + + // (?:a+?) matches as few 'a's as possible, after matching the first + // (?>a+?) always matches exactly one 'a' + firstMatchTests( + #"^(?:a+?)a$"#, + (input: "a", match: nil), + (input: "aa", match: "aa"), + (input: "aaa", match: "aaa")) + firstMatchTests( + #"^(?>a+?)a$"#, + (input: "a", match: nil), + (input: "aa", match: "aa"), + (input: "aaa", match: nil)) + + // (?:a?+) and (?>a?+) are equivalent: they match one 'a' if available + firstMatchTests( + #"^(?:a?+)a$"#, + (input: "a", match: nil), + xfail: true) + firstMatchTests( + #"^(?:a?+)a$"#, + (input: "aa", match: "aa"), + (input: "aaa", match: nil)) + firstMatchTests( + #"^(?>a?+)a$"#, + (input: "a", match: nil), + (input: "aa", match: "aa"), + (input: "aaa", match: nil)) - firstMatchTest( - #"(?>(\d+))\w+\1"#, input: "123x12", match: nil) - firstMatchTest( - #"(?>(\d+))\w+\1"#, input: "123x23", match: "23x23", + firstMatchTests( + #"(?>(\d+))\w+\1"#, + (input: "123x12", match: nil)) + firstMatchTests( + #"(?>(\d+))\w+\1"#, + (input: "23x23", match: "23x23"), + (input: "123x23", match: "23x23"), xfail: true) // TODO: Test example where non-atomic is significant From e61f5db599e67f7aec18379a24ddf45a503c86c1 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 16 Jun 2022 12:41:16 -0500 Subject: [PATCH 6/7] Add a non-atomic version of a test --- Tests/RegexTests/MatchTests.swift | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 319eaf976..c3006aeb5 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1095,6 +1095,12 @@ extension RegexTests { (input: "aa", match: "aa"), (input: "aaa", match: nil)) + // Capture behavior in non-atomic vs atomic groups + firstMatchTests( + #"(\d+)\w+\1"#, + (input: "123x12", match: "123x12"), 
// `\w+` matches "3x" in this case + (input: "23x23", match: "23x23"), + (input: "123x23", match: "23x23")) firstMatchTests( #"(?>(\d+))\w+\1"#, (input: "123x12", match: nil)) From e74a1130c16fd527ca56181e528ec8c4bb0535bf Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 16 Jun 2022 13:03:29 -0500 Subject: [PATCH 7/7] Add tests for backreferences in lookaheads --- Tests/RegexTests/MatchTests.swift | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c3006aeb5..5b08093e6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1109,6 +1109,37 @@ extension RegexTests { (input: "23x23", match: "23x23"), (input: "123x23", match: "23x23"), xfail: true) + + // Backreferences in lookaheads + firstMatchTests( + #"^(?=.*(.)(.)\2\1).+$"#, + (input: "abbba", match: nil), + (input: "ABBA", match: "ABBA"), + (input: "defABBAdef", match: "defABBAdef")) + firstMatchTests( + #"^(?=.*(.)(.)\2\1).+\2$"#, + (input: "abbba", match: nil), + (input: "ABBA", match: nil), + (input: "defABBAdef", match: nil)) + // FIXME: Backreferences don't escape positive lookaheads + firstMatchTests( + #"^(?=.*(.)(.)\2\1).+\2$"#, + (input: "ABBAB", match: "ABBAB"), + (input: "defABBAdefB", match: "defABBAdefB"), + xfail: true) + + firstMatchTests( + #"^(?!.*(.)(.)\2\1).+$"#, + (input: "abbba", match: "abbba"), + (input: "ABBA", match: nil), + (input: "defABBAdef", match: nil)) + // Backreferences don't escape negative lookaheads; + // matching only proceeds when the lookahead fails + firstMatchTests( + #"^(?!.*(.)(.)\2\1).+\2$"#, + (input: "abbba", match: nil), + (input: "abbbab", match: nil), + (input: "ABBAB", match: nil)) // TODO: Test example where non-atomic is significant firstMatchTest(