diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 66fefc49e..ce965de68 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -84,7 +84,8 @@ fileprivate extension Compiler.ByteCodeGen { try emitBackreference(ref.ast) case let .symbolicReference(id): - builder.buildUnresolvedReference(id: id) + builder.buildUnresolvedReference( + id: id, isScalarMode: options.semanticLevel == .unicodeScalar) case let .changeMatchingOptions(optionSequence): if !hasEmittedFirstMatchableAtom { @@ -143,9 +144,11 @@ fileprivate extension Compiler.ByteCodeGen { guard let i = n.value else { throw Unreachable("Expected a value") } - builder.buildBackreference(.init(i)) + builder.buildBackreference( + .init(i), isScalarMode: options.semanticLevel == .unicodeScalar) case .named(let name): - try builder.buildNamedReference(name) + try builder.buildNamedReference( + name, isScalarMode: options.semanticLevel == .unicodeScalar) case .relative: throw Unsupported("Backreference kind: \(ref)") } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 1e2ed757b..3d618e416 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -240,6 +240,13 @@ extension Instruction.Payload { interpret() } + init(capture: CaptureRegister, isScalarMode: Bool) { + self.init(isScalarMode ? 1 : 0, capture) + } + var captureAndMode: (isScalarMode: Bool, CaptureRegister) { + let pair: (UInt64, CaptureRegister) = interpretPair() + return (pair.0 == 1, pair.1) + } init(capture: CaptureRegister) { self.init(capture) } @@ -247,7 +254,6 @@ extension Instruction.Payload { interpret() } - // MARK: Packed operand payloads init(immediate: UInt64, int: IntRegister) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 959b1507e..7ecfb5b70 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -292,22 +292,23 @@ extension MEProgram.Builder { } mutating func buildBackreference( - _ cap: CaptureRegister + _ cap: CaptureRegister, + isScalarMode: Bool ) { instructions.append( - .init(.backreference, .init(capture: cap))) + .init(.backreference, .init(capture: cap, isScalarMode: isScalarMode))) } - mutating func buildUnresolvedReference(id: ReferenceID) { - buildBackreference(.init(0)) + mutating func buildUnresolvedReference(id: ReferenceID, isScalarMode: Bool) { + buildBackreference(.init(0), isScalarMode: isScalarMode) unresolvedReferences[id, default: []].append(lastInstructionAddress) } - mutating func buildNamedReference(_ name: String) throws { + mutating func buildNamedReference(_ name: String, isScalarMode: Bool) throws { guard let index = captureList.indexOfCapture(named: name) else { throw RegexCompilationError.uncapturedReference } - buildBackreference(.init(index)) + buildBackreference(.init(index), isScalarMode: isScalarMode) } // TODO: Mutating because of fail address fixup, drop when @@ -456,8 +457,10 @@ fileprivate extension MEProgram.Builder { throw RegexCompilationError.uncapturedReference } for use in uses { + let (isScalarMode, _) = instructions[use.rawValue].payload.captureAndMode instructions[use.rawValue] = - Instruction(.backreference, .init(capture: .init(offset))) + Instruction(.backreference, + .init(capture: .init(offset), isScalarMode: isScalarMode)) } } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a62c1e070..9a234e356 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -231,9 +231,17 @@ extension Processor { // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. - mutating func matchSeq( - _ seq: C - ) -> Bool where C.Element == Input.Element { + mutating func matchSeq( + _ seq: Substring, + isScalarMode: Bool + ) -> Bool { + if isScalarMode { + for s in seq.unicodeScalars { + guard matchScalar(s, boundaryCheck: false) else { return false } + } + return true + } + for e in seq { guard match(e) else { return false } } @@ -584,8 +592,9 @@ extension Processor { } case .backreference: + let (isScalarMode, capture) = payload.captureAndMode let capNum = Int( - asserting: payload.capture.rawValue) + asserting: capture.rawValue) guard capNum < storedCaptures.count else { fatalError("Should this be an assert?") } @@ -597,7 +606,7 @@ extension Processor { signalFailure() return } - if matchSeq(input[range]) { + if matchSeq(input[range], isScalarMode: isScalarMode) { controller.step() } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 794e57b16..e86352285 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1643,6 +1643,11 @@ extension RegexTests { (input: "123x23", match: "23x23"), xfail: true) + // Backreferences in scalar mode + // In scalar mode the backreference should not match + firstMatchTest(#"(.+)\1"#, input: "ée\u{301}", match: "ée\u{301}") + firstMatchTest(#"(.+)\1"#, input: "ée\u{301}", match: nil, semanticLevel: .unicodeScalar) + // Backreferences in lookaheads firstMatchTests( #"^(?=.*(.)(.)\2\1).+$"#,