diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index c8f3af561..2a5a47395 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -42,6 +42,21 @@ extension CaptureList { } } +extension CaptureList { + /// Retrieve the capture index of a given named capture, or `nil` if there is + /// no such capture. + public func indexOfCapture(named name: String) -> Int? { + // Named references are guaranteed to be unique for literal ASTs by Sema. + // The DSL tree does not use named references. + captures.indices.first(where: { captures[$0].name == name }) + } + + /// Whether the capture list has a given named capture. + public func hasCapture(named name: String) -> Bool { + indexOfCapture(named: name) != nil + } +} + // MARK: Generating from AST extension AST.Node { diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 382f78787..479604582 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -86,6 +86,7 @@ enum ParseError: Error, Hashable { case unsupported(String) case deprecatedUnicode(String) case invalidReference(Int) + case invalidNamedReference(String) case duplicateNamedCapture(String) case invalidCharacterClassRangeOperand case invalidQuantifierRange(Int, Int) @@ -211,6 +212,8 @@ extension ParseError: CustomStringConvertible { return "\(kind) is a deprecated Unicode property, and is not supported" case let .invalidReference(i): return "no capture numbered \(i)" + case let .invalidNamedReference(name): + return "no capture named '\(name)'" case let .duplicateNamedCapture(str): return "group named '\(str)' already exists" case let .invalidQuantifierRange(lhs, rhs): diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 395dae23b..19a650de7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -72,20 +72,20 @@ extension RegexValidator { } func validateReference(_ ref: AST.Reference) throws { + if let recLevel = ref.recursionLevel { + throw error(.unsupported("recursion level"), at: recLevel.location) + } switch ref.kind { case .absolute(let i): guard i <= captures.captures.count else { throw error(.invalidReference(i), at: ref.innerLoc) } + case .named(let name): + guard captures.hasCapture(named: name) else { + throw error(.invalidNamedReference(name), at: ref.innerLoc) + } case .relative: throw error(.unsupported("relative capture reference"), at: ref.innerLoc) - case .named: - // TODO: This could be implemented by querying the capture list for an - // index. - throw error(.unsupported("named capture reference"), at: ref.innerLoc) - } - if let recLevel = ref.recursionLevel { - throw error(.unsupported("recursion level"), at: recLevel.location) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index ccca1878d..3a91b6c67 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -5,6 +5,11 @@ extension Compiler { var options: MatchingOptions var builder = Program.Builder() + init(options: MatchingOptions, captureList: CaptureList) { + self.options = options + self.builder.captureList = captureList + } + mutating func finish( ) throws -> Program { builder.buildAccept() @@ -62,7 +67,9 @@ extension Compiler.ByteCodeGen { case .absolute(let i): // Backreferences number starting at 1 builder.buildBackreference(.init(i-1)) - case .relative, .named: + case .named(let name): + try builder.buildNamedReference(name) + case .relative: throw Unsupported("Backreference kind: \(ref)") } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 1c20761c8..f02d4959c 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -27,8 +27,9 @@ class Compiler { __consuming func emit() throws -> Program { // TODO: Handle global options - var codegen = ByteCodeGen(options: options) - codegen.builder.captureList = tree.root._captureList + var codegen = ByteCodeGen( + options: options, captureList: tree.root._captureList + ) try codegen.emitNode(tree.root) let program = try codegen.finish() return program diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 416583f7b..f706c0471 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -44,7 +44,7 @@ extension MEProgram where Input.Element: Hashable { // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] var referencedCaptureOffsets: [ReferenceID: Int] = [:] - var namedCaptureOffsets: [String: Int] = [:] + var captureCount: Int { // We currently deduce the capture count from the capture register number. nextCaptureRegister.rawValue @@ -284,6 +284,13 @@ extension MEProgram.Builder { unresolvedReferences[id, default: []].append(lastInstructionAddress) } + mutating func buildNamedReference(_ name: String) throws { + guard let index = captureList.indexOfCapture(named: name) else { + throw RegexCompilationError.uncapturedReference + } + buildBackreference(.init(index)) + } + // TODO: Mutating because of fail address fixup, drop when // that's removed mutating func assemble() throws -> MEProgram { @@ -359,7 +366,6 @@ extension MEProgram.Builder { registerInfo: regInfo, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, - namedCaptureOffsets: namedCaptureOffsets, initialOptions: initialOptions) } @@ -456,9 +462,10 @@ extension MEProgram.Builder { assert(preexistingValue == nil) } if let name = name { - // TODO: Reject duplicate capture names unless `(?J)`? - namedCaptureOffsets.updateValue(captureCount, forKey: name) + let index = captureList.indexOfCapture(named: name) + assert(index == nextCaptureRegister.rawValue) } + assert(nextCaptureRegister.rawValue < captureList.captures.count) return nextCaptureRegister } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index e3a542c1e..7003c0261 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -145,7 +145,6 @@ extension Processor._StoredCapture: CustomStringConvertible { struct MECaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] - var namedCaptureOffsets: [String: Int] // func extract(from s: String) -> Array> { // caps.map { $0.map { s[$0] } } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 5c3010a75..8b4737e7a 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -36,7 +36,6 @@ struct MEProgram where Input.Element: Equatable { let captureList: CaptureList let referencedCaptureOffsets: [ReferenceID: Int] - let namedCaptureOffsets: [String: Int] var initialOptions: MatchingOptions } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e44b110e5..abbdbd80a 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -37,8 +37,7 @@ struct Executor { let capList = MECaptureList( values: cpu.storedCaptures, - referencedCaptureOffsets: engine.program.referencedCaptureOffsets, - namedCaptureOffsets: engine.program.namedCaptureOffsets) + referencedCaptureOffsets: engine.program.referencedCaptureOffsets) let range = inputRange.lowerBound...)(?P=a1)"#, + input: "aaaaaaaaabbc", match: "aaaaaaaaabb") + firstMatchTest( #"(.)\g001"#, input: "112", match: "11") - firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) - firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba", xfail: true) - firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) + firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba") + + firstMatchTest(#"(?.)(?.)(?.)\k\k\k"#, + input: "xyzzxy", match: "xyzzxy") firstMatchTest(#"\1(.)"#, input: "112", match: nil) + firstMatchTest(#"\k(?.)"#, input: "112", match: nil) + + // TODO: Implement subpattern matching. + firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) + firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) } func testMatchExamples() { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 975d86b75..866dc2795 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1231,16 +1231,37 @@ extension RegexTests { parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) - parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported) - parseTest(#"\k"#, backreference(.named("bc")), throwsError: .unsupported) - parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported) - parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported) + parseTest( + #"(?)\k"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + parseTest( + #"(?)\k{a}"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + parseTest( + #"(?)\g{a}"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + parseTest( + #"(?)(?P=a)"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .invalid) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid) // Oniguruma recursion levels. parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid) + parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported) + parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) @@ -2137,7 +2158,7 @@ extension RegexTests { throwsError: .unsupported ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid) parseWithDelimitersTest( #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), throwsError: .unsupported @@ -2774,6 +2795,12 @@ extension RegexTests { diagnosticTest(#"(?:)()\2"#, .invalidReference(2)) diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2)) + diagnosticTest(#"\k"#, .invalidNamedReference("a")) + diagnosticTest(#"(?:)\k"#, .invalidNamedReference("a")) + diagnosticTest(#"()\k"#, .invalidNamedReference("a")) + diagnosticTest(#"()\k()"#, .invalidNamedReference("a")) + diagnosticTest(#"(?)\k()"#, .invalidNamedReference("a")) + // MARK: Conditionals diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))