diff --git a/Documentation/Evolution/RegexTypeOverview.md b/Documentation/Evolution/RegexTypeOverview.md index adb6640ad..bce336551 100644 --- a/Documentation/Evolution/RegexTypeOverview.md +++ b/Documentation/Evolution/RegexTypeOverview.md @@ -1,4 +1,3 @@ - # Regex Type and Overview - Authors: [Michael Ilseman](https://github.com/milseman) @@ -225,7 +224,7 @@ func processEntry(_ line: String) -> Transaction? { The result builder allows for inline failable value construction, which participates in the overall string processing algorithm: returning `nil` signals a local failure and the engine backtracks to try an alternative. This not only relieves the use site from post-processing, it enables new kinds of processing algorithms, allows for search-space pruning, and enhances debuggability. -Swift regexes describe an unambiguous algorithm, were choice is ordered and effects can be reliably observed. For example, a `print()` statement inside the `TryCapture`'s transform function will run whenever the overall algorithm naturally dictates an attempt should be made. Optimizations can only elide such calls if they can prove it is behavior-preserving (e.g. "pure"). +Swift regexes describe an unambiguous algorithm, where choice is ordered and effects can be reliably observed. For example, a `print()` statement inside the `TryCapture`'s transform function will run whenever the overall algorithm naturally dictates an attempt should be made. Optimizations can only elide such calls if they can prove it is behavior-preserving (e.g. "pure"). `CustomMatchingRegexComponent`, discussed in [String Processing Algorithms][pitches], allows industrial-strength parsers to be used a regex components. This allows us to drop the overly-permissive pre-parsing step: @@ -278,14 +277,14 @@ func processEntry(_ line: String) -> Transaction? { *Note*: Details on how references work is discussed in [Regex Builders][pitches]. `Regex.Match` supports referring to _all_ captures by position (`match.1`, etc.) whether named or referenced or neither. Due to compiler limitations, result builders do not support forming labeled tuples for named captures. -### Algorithms, algorithms everywhere +### Regex-powered algorithms Regexes can be used right out of the box with a variety of powerful and convenient algorithms, including trimming, splitting, and finding/replacing all matches within a string. These algorithms are discussed in [String Processing Algorithms][pitches]. -### Onward Unicode +### Unicode handling A regex describes an algorithm to be ran over some model of string, and Swift's `String` has a rather unique Unicode-forward model. `Character` is an [extended grapheme cluster](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) and equality is determined under [canonical equivalence](https://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence). @@ -310,12 +309,12 @@ public struct Regex { /// Match a string in its entirety. /// /// Returns `nil` if no match and throws on abort - public func matchWhole(_ s: String) throws -> Regex.Match? + public func wholeMatch(in s: String) throws -> Regex.Match? /// Match part of the string, starting at the beginning. /// /// Returns `nil` if no match and throws on abort - public func matchPrefix(_ s: String) throws -> Regex.Match? + public func prefixMatch(in s: String) throws -> Regex.Match? /// Find the first match in a string /// @@ -325,17 +324,17 @@ public struct Regex { /// Match a substring in its entirety. /// /// Returns `nil` if no match and throws on abort - public func matchWhole(_ s: Substring) throws -> Regex.Match? + public func wholeMatch(in s: Substring) throws -> Regex.Match? /// Match part of the string, starting at the beginning. /// /// Returns `nil` if no match and throws on abort - public func matchPrefix(_ s: Substring) throws -> Regex.Match? + public func prefixMatch(in s: Substring) throws -> Regex.Match? /// Find the first match in a substring /// /// Returns `nil` if no match is found and throws on abort - public func firstMatch(_ s: Substring) throws -> Regex.Match? + public func firstMatch(in s: Substring) throws -> Regex.Match? /// The result of matching a regex against a string. /// @@ -344,19 +343,19 @@ public struct Regex { @dynamicMemberLookup public struct Match { /// The range of the overall match - public let range: Range + public var range: Range { get } /// The produced output from the match operation - public var output: Output + public var output: Output { get } /// Lookup a capture by name or number - public subscript(dynamicMember keyPath: KeyPath) -> T + public subscript(dynamicMember keyPath: KeyPath) -> T { get } /// Lookup a capture by number @_disfavoredOverload public subscript( dynamicMember keyPath: KeyPath<(Output, _doNotUse: ()), Output> - ) -> Output + ) -> Output { get } // Note: this allows `.0` when `Match` is not a tuple. } @@ -482,6 +481,7 @@ We're also looking for more community discussion on what the default type system The actual `Match` struct just stores ranges: the `Substrings` are lazily created on demand. This avoids unnecessary ARC traffic and memory usage. + ### `Regex` instead of `Regex` The generic parameter `Output` is proposed to contain both the whole match (the `.0` element if `Output` is a tuple) and captures. One alternative we have considered is separating `Output` into the entire match and the captures, i.e. `Regex`, and using `Void` for for `Captures` when there are no captures. diff --git a/Sources/Exercises/Participants/RegexParticipant.swift b/Sources/Exercises/Participants/RegexParticipant.swift index 71018d8a7..6fddc0914 100644 --- a/Sources/Exercises/Participants/RegexParticipant.swift +++ b/Sources/Exercises/Participants/RegexParticipant.swift @@ -63,7 +63,7 @@ private func graphemeBreakPropertyData( forLine line: String, using regex: RP ) -> GraphemeBreakEntry? where RP.Output == (Substring, Substring, Substring?, Substring) { - line.matchWhole(regex).map(\.output).flatMap(extractFromCaptures) + line.wholeMatch(of: regex).map(\.output).flatMap(extractFromCaptures) } private func graphemeBreakPropertyDataLiteral( @@ -80,7 +80,7 @@ private func graphemeBreakPropertyDataLiteral( private func graphemeBreakPropertyData( forLine line: String ) -> GraphemeBreakEntry? { - line.matchWhole { + line.wholeMatch { TryCapture(OneOrMore(.hexDigit)) { Unicode.Scalar(hex: $0) } Optionally { ".." diff --git a/Sources/RegexBuilder/Match.swift b/Sources/RegexBuilder/Match.swift index ac07ec0b8..e6718d96b 100644 --- a/Sources/RegexBuilder/Match.swift +++ b/Sources/RegexBuilder/Match.swift @@ -12,29 +12,29 @@ import _StringProcessing extension String { - public func matchWhole( - @RegexComponentBuilder _ content: () -> R + public func wholeMatch( + @RegexComponentBuilder of content: () -> R ) -> Regex.Match? { - matchWhole(content()) + wholeMatch(of: content()) } - public func matchPrefix( - @RegexComponentBuilder _ content: () -> R + public func prefixMatch( + @RegexComponentBuilder of content: () -> R ) -> Regex.Match? { - matchPrefix(content()) + prefixMatch(of: content()) } } extension Substring { - public func matchWhole( - @RegexComponentBuilder _ content: () -> R + public func wholeMatch( + @RegexComponentBuilder of content: () -> R ) -> Regex.Match? { - matchWhole(content()) + wholeMatch(of: content()) } - public func matchPrefix( - @RegexComponentBuilder _ content: () -> R + public func prefixMatch( + @RegexComponentBuilder of content: () -> R ) -> Regex.Match? { - matchPrefix(content()) + prefixMatch(of: content()) } } diff --git a/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift b/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift index dcc5d9f2b..2718d520a 100644 --- a/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift +++ b/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +// FIXME: What even is this? Can we delete this whole thing? struct RegexConsumer< R: RegexComponent, Consumed: BidirectionalCollection > where Consumed.SubSequence == Substring { @@ -24,7 +25,7 @@ extension RegexConsumer { func _matchingConsuming( _ consumed: Substring, in range: Range ) -> (upperBound: String.Index, match: Match)? { - guard let result = try! regex._match( + guard let result = try! regex.regex._match( consumed.base, in: range, mode: .partialFromFront ) else { return nil } diff --git a/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift b/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift index 2aea0f342..1ad555e7d 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift @@ -55,9 +55,9 @@ extension BidirectionalCollection where SubSequence == Substring { /// - Returns: The first match of `regex` in the collection, or `nil` if /// there isn't a match. public func firstMatch( - of regex: R + of r: R ) -> Regex.Match? { let slice = self[...] - return try? regex.firstMatch(in: slice.base) + return try? r.regex.firstMatch(in: slice.base) } } diff --git a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift index 6453fccb0..24cdbee0f 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift @@ -202,14 +202,17 @@ extension BidirectionalCollection where SubSequence == Substring { // FIXME: Replace the returned value as `some Collection.Match> // when SE-0346 is enabled - func _matches(of regex: R) -> [Regex.Match] { + func _matches(of r: R) -> [Regex.Match] { let slice = self[...] var start = self.startIndex let end = self.endIndex + let regex = r.regex var result = [Regex.Match]() while start < end { - guard let match = try? regex._firstMatch(slice.base, in: start.. Regex.Match? { + public func wholeMatch(in s: String) throws -> Regex.Match? { try _match(s, in: s.startIndex.. Regex.Match? { + public func prefixMatch(in s: String) throws -> Regex.Match? { try _match(s, in: s.startIndex.. Regex.Match? { + public func wholeMatch(in s: Substring) throws -> Regex.Match? { try _match(s.base, in: s.startIndex.. Regex.Match? { + public func prefixMatch(in s: Substring) throws -> Regex.Match? { try _match(s.base, in: s.startIndex..(_ regex: R) -> Regex.Match? { - try? regex.matchWhole(self) + public func wholeMatch( + of r: R + ) -> Regex.Match? { + try? r.regex.wholeMatch(in: self) } - public func matchPrefix(_ regex: R) -> Regex.Match? { - try? regex.matchPrefix(self) + public func prefixMatch( + of r: R + ) -> Regex.Match? { + try? r.regex.prefixMatch(in: self) } } extension Substring { - public func matchWhole(_ regex: R) -> Regex.Match? { - try? regex.matchWhole(self) - } - public func matchPrefix(_ regex: R) -> Regex.Match? { - try? regex.matchPrefix(self) + public func wholeMatch( + of r: R + ) -> Regex.Match? { + try? r.regex.wholeMatch(in: self) + } + public func prefixMatch( + of r: R + ) -> Regex.Match? { + try? r.regex.prefixMatch(in: self) } } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index 79ebc2693..555ecb8ca 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -62,7 +62,7 @@ func customTest( let result: Match? switch call { case .match: - result = input.matchWhole(regex)?.output + result = input.wholeMatch(of: regex)?.output case .firstMatch: result = input.firstMatch(of: regex)?.output } @@ -167,7 +167,7 @@ class CustomRegexComponentTests: XCTestCase { // TODO: Why is Radix optional? do { - guard let m = try hexRegex.matchWhole("123aef.345") else { + guard let m = try hexRegex.wholeMatch(in: "123aef.345") else { XCTFail() return } @@ -180,7 +180,7 @@ class CustomRegexComponentTests: XCTestCase { } do { - _ = try hexRegex.matchWhole("123aef❗️345") + _ = try hexRegex.wholeMatch(in: "123aef❗️345") XCTFail() } catch let e as Abort { XCTAssertEqual(e, Abort()) @@ -202,7 +202,7 @@ class CustomRegexComponentTests: XCTestCase { } do { - guard let m = try addressRegex.matchWhole("0x1234567f") else { + guard let m = try addressRegex.wholeMatch(in: "0x1234567f") else { XCTFail() return } @@ -213,7 +213,7 @@ class CustomRegexComponentTests: XCTestCase { } do { - _ = try addressRegex.matchWhole("0xdeadbeef") + _ = try addressRegex.wholeMatch(in: "0xdeadbeef") XCTFail() } catch let e as Poison { XCTAssertEqual(e, Poison()) diff --git a/Tests/RegexBuilderTests/MotivationTests.swift b/Tests/RegexBuilderTests/MotivationTests.swift index 882ba6448..1927b9ae4 100644 --- a/Tests/RegexBuilderTests/MotivationTests.swift +++ b/Tests/RegexBuilderTests/MotivationTests.swift @@ -199,7 +199,7 @@ private func process( _ line: String, using regex: Regex<(Substring, Substring, Substring, Substring, Substring)> ) -> Transaction? { - guard let output = try? regex.matchWhole(line), + guard let output = try? regex.wholeMatch(in: line), let kind = Transaction.Kind(output.1) else { return nil diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 1f5305c59..0c0bf7c8f 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -24,7 +24,7 @@ class RegexDSLTests: XCTestCase { ) throws { let regex = content() for (input, maybeExpectedCaptures) in tests { - let maybeMatch = input.matchWhole(regex) + let maybeMatch = input.wholeMatch(of: regex) if let expectedCaptures = maybeExpectedCaptures { let match = try XCTUnwrap(maybeMatch, file: file, line: line) XCTAssertTrue( @@ -52,12 +52,12 @@ class RegexDSLTests: XCTestCase { } // Assert the inferred capture type. let _: (Substring, Substring, Int).Type = type(of: regex).Output.self - let maybeMatch = "ab1".matchWhole(regex) + let maybeMatch = "ab1".wholeMatch(of: regex) let match = try XCTUnwrap(maybeMatch) XCTAssertTrue(match.output == ("ab1", "b", 1)) let substring = "ab1"[...] - let substringMatch = try XCTUnwrap(substring.matchWhole(regex)) + let substringMatch = try XCTUnwrap(substring.wholeMatch(of: regex)) XCTAssertTrue(match.output == substringMatch.output) } @@ -126,7 +126,7 @@ class RegexDSLTests: XCTestCase { } func testMatchResultDotZeroWithoutCapture() throws { - let match = try XCTUnwrap("aaa".matchWhole { OneOrMore { "a" } }) + let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") } @@ -135,8 +135,8 @@ class RegexDSLTests: XCTestCase { let regex = ChoiceOf { "aaa" } - XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") - XCTAssertNil("aab".matchWhole(regex)?.output) + XCTAssertTrue("aaa".wholeMatch(of: regex)?.output == "aaa") + XCTAssertNil("aab".wholeMatch(of: regex)?.output) } do { let regex = ChoiceOf { @@ -144,10 +144,10 @@ class RegexDSLTests: XCTestCase { "bbb" "ccc" } - XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") - XCTAssertNil("aab".matchWhole(regex)?.output) - XCTAssertTrue("bbb".matchWhole(regex)?.output == "bbb") - XCTAssertTrue("ccc".matchWhole(regex)?.output == "ccc") + XCTAssertTrue("aaa".wholeMatch(of: regex)?.output == "aaa") + XCTAssertNil("aab".wholeMatch(of: regex)?.output) + XCTAssertTrue("bbb".wholeMatch(of: regex)?.output == "bbb") + XCTAssertTrue("ccc".wholeMatch(of: regex)?.output == "ccc") } do { let regex = Regex { @@ -162,7 +162,7 @@ class RegexDSLTests: XCTestCase { } } XCTAssertTrue( - try XCTUnwrap("abc".matchWhole(regex)?.output) == ("abc", "c")) + try XCTUnwrap("abc".wholeMatch(of: regex)?.output) == ("abc", "c")) } do { let regex = ChoiceOf { @@ -170,18 +170,18 @@ class RegexDSLTests: XCTestCase { "bbb" "ccc" } - XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") - XCTAssertNil("aab".matchWhole(regex)?.output) - XCTAssertTrue("bbb".matchWhole(regex)?.output == "bbb") - XCTAssertTrue("ccc".matchWhole(regex)?.output == "ccc") + XCTAssertTrue("aaa".wholeMatch(of: regex)?.output == "aaa") + XCTAssertNil("aab".wholeMatch(of: regex)?.output) + XCTAssertTrue("bbb".wholeMatch(of: regex)?.output == "bbb") + XCTAssertTrue("ccc".wholeMatch(of: regex)?.output == "ccc") } do { let regex = ChoiceOf { Capture("aaa") } XCTAssertTrue( - try XCTUnwrap("aaa".matchWhole(regex)?.output) == ("aaa", "aaa")) - XCTAssertNil("aab".matchWhole(regex)?.output) + try XCTUnwrap("aaa".wholeMatch(of: regex)?.output) == ("aaa", "aaa")) + XCTAssertNil("aab".wholeMatch(of: regex)?.output) } do { let regex = ChoiceOf { @@ -190,12 +190,12 @@ class RegexDSLTests: XCTestCase { Capture("ccc") } XCTAssertTrue( - try XCTUnwrap("aaa".matchWhole(regex)?.output) == ("aaa", "aaa", nil, nil)) + try XCTUnwrap("aaa".wholeMatch(of: regex)?.output) == ("aaa", "aaa", nil, nil)) XCTAssertTrue( - try XCTUnwrap("bbb".matchWhole(regex)?.output) == ("bbb", nil, "bbb", nil)) + try XCTUnwrap("bbb".wholeMatch(of: regex)?.output) == ("bbb", nil, "bbb", nil)) XCTAssertTrue( - try XCTUnwrap("ccc".matchWhole(regex)?.output) == ("ccc", nil, nil, "ccc")) - XCTAssertNil("aab".matchWhole(regex)?.output) + try XCTUnwrap("ccc".wholeMatch(of: regex)?.output) == ("ccc", nil, nil, "ccc")) + XCTAssertNil("aab".wholeMatch(of: regex)?.output) } } @@ -407,7 +407,7 @@ class RegexDSLTests: XCTestCase { // Assert the inferred capture type. let _: Substring.Type = type(of: regex).Output.self let input = "123123" - let match = try XCTUnwrap(input.matchWhole(regex)?.output) + let match = try XCTUnwrap(input.wholeMatch(of: regex)?.output) XCTAssertTrue(match == input) } @@ -534,7 +534,7 @@ class RegexDSLTests: XCTestCase { let unicodeLine = "1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP" - let match = try XCTUnwrap(unicodeLine.matchWhole(unicodeData)) + let match = try XCTUnwrap(unicodeLine.wholeMatch(of: unicodeData)) XCTAssertEqual(match.0, Substring(unicodeLine)) XCTAssertEqual(match.1, "Control") } @@ -566,7 +566,7 @@ class RegexDSLTests: XCTestCase { Substring, Unicode.Scalar?, Unicode.Scalar??, Substring ) let _: ExpectedMatch.Type = type(of: regexWithCapture).Output.self - let maybeMatchResult = line.matchWhole(regexWithCapture) + let maybeMatchResult = line.wholeMatch(of: regexWithCapture) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -601,7 +601,7 @@ class RegexDSLTests: XCTestCase { Substring, Unicode.Scalar, Unicode.Scalar?, Substring ) let _: ExpectedMatch.Type = type(of: regexWithTryCapture).Output.self - let maybeMatchResult = line.matchWhole(regexWithTryCapture) + let maybeMatchResult = line.wholeMatch(of: regexWithTryCapture) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -614,7 +614,7 @@ class RegexDSLTests: XCTestCase { let regexLiteral = try Regex( compiling: #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, as: (Substring, Substring, Substring?, Substring).self) - let maybeMatchResult = line.matchWhole(regexLiteral) + let maybeMatchResult = line.wholeMatch(of: regexLiteral) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -628,7 +628,7 @@ class RegexDSLTests: XCTestCase { do { let regex = try Regex(compiling: "aabcc.") let line = "aabccd" - let match = try XCTUnwrap(line.matchWhole(regex)) + let match = try XCTUnwrap(line.wholeMatch(of: regex)) XCTAssertEqual(match.0, line[...]) let output = match.output XCTAssertEqual(output[0].substring, line[...]) @@ -640,7 +640,7 @@ class RegexDSLTests: XCTestCase { A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ COMBINING MARK TUKWENTIS """ - let match = try XCTUnwrap(line.matchWhole(regex)) + let match = try XCTUnwrap(line.wholeMatch(of: regex)) XCTAssertEqual(match.0, line[...]) let output = match.output XCTAssertEqual(output[0].substring, line[...]) @@ -705,7 +705,7 @@ class RegexDSLTests: XCTestCase { } } let input = "abc#41#42abc#42#42" - let result = try XCTUnwrap(input.matchWhole(regex)) + let result = try XCTUnwrap(input.wholeMatch(of: regex)) XCTAssertEqual(result[a], "abc") XCTAssertEqual(result[b], 42) } @@ -785,7 +785,7 @@ class RegexDSLTests: XCTestCase { let parser = SemanticVersionParser() for (str, version) in versions { - XCTAssertEqual(str.matchWhole(parser)?.output, version) + XCTAssertEqual(str.wholeMatch(of: parser)?.output, version) } } }