Skip to content

Commit 0bee970

Browse files
committed
Address some literal pattern printing issues
This fixes some issues; all regex match tests now succeed with the round-tripped regex when that regex is supported.
1 parent 9828bf3 commit 0bee970

File tree

5 files changed

+141
-51
lines changed

5 files changed

+141
-51
lines changed

Sources/_StringProcessing/LiteralPrinter.swift

Lines changed: 76 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ extension LiteralPrinter {
118118
case let .atom(atom):
119119
outputAtom(atom)
120120
case let .quotedLiteral(literal):
121-
outputQuotedLiteral(literal)
121+
output(prepareQuotedLiteral(literal))
122122

123123
case .trivia(_):
124124
// TODO: Include trivia?
@@ -157,13 +157,40 @@ extension LiteralPrinter {
157157
output(")")
158158
}
159159

160+
/// Reports whether `node` needs to be wrapped in a group.
///
/// `outputQuantification` consults this and, when the answer is `true`,
/// surrounds the child with `(?:` … `)` before emitting the quantifier.
///
/// - Parameter node: The DSL tree node to inspect.
/// - Returns: `true` for a concatenation with two or more children, or for a
///   quoted literal whose prepared text is longer than one character. A
///   single-child concatenation defers to that child; an empty concatenation
///   and every other node kind return `false`.
func requiresGrouping(_ node: DSLTree.Node) -> Bool {
161+
switch node {
162+
case .concatenation(let children):
163+
switch children.count {
164+
case 0:
165+
return false
166+
case 1:
167+
// A one-element concatenation is only as complex as its sole child.
return requiresGrouping(children.first!)
168+
default:
169+
return true
170+
}
171+
172+
case .quotedLiteral(let literal):
173+
// Measure the text that will actually be printed, not the raw literal.
return prepareQuotedLiteral(literal).count > 1
174+
175+
default:
176+
return false
177+
}
178+
}
179+
160180
mutating func outputQuantification(
161181
_ amount: DSLTree._AST.QuantificationAmount,
162182
_ kind: DSLTree.QuantificationKind,
163183
_ child: DSLTree.Node
164184
) {
165-
outputNode(child)
166-
185+
// RegexBuilder regexes can have children that need an explicit group before a quantifier is applied
186+
if requiresGrouping(child) {
187+
output("(?:")
188+
outputNode(child)
189+
output(")")
190+
} else {
191+
outputNode(child)
192+
}
193+
167194
switch amount.ast {
168195
case .zeroOrMore:
169196
output("*")
@@ -254,7 +281,7 @@ extension LiteralPrinter {
254281
mutating func outputAtom(_ atom: DSLTree.Atom) {
255282
switch atom {
256283
case .char(let char):
257-
output(char.escapingConfusable)
284+
output(char.escapingForLiteral)
258285
case .scalar(let scalar):
259286
output(scalar.escapedString)
260287
case .any:
@@ -284,11 +311,12 @@ extension LiteralPrinter {
284311
case .symbolicReference(_):
285312
// RegexBuilder only
286313
saveInconvertible(.atom(atom))
287-
case .changeMatchingOptions(let options):
288-
output(options.ast._patternString)
314+
case .changeMatchingOptions(let optionSequence):
315+
output(optionSequence.ast._patternString)
289316
output(")")
317+
options.apply(optionSequence.ast)
290318
case .unconverted(let atom):
291-
output(atom.ast._dumpBase)
319+
outputUnconvertedAST(atom.ast)
292320
}
293321
}
294322

@@ -316,12 +344,11 @@ extension LiteralPrinter {
316344
}
317345
}
318346

319-
mutating func outputQuotedLiteral(_ literal: String) {
320-
// TODO: Look for confusable characters
321-
if literal.containsRegexMetaCharacters {
322-
output(#"\Q\#(literal)\E"#)
347+
func prepareQuotedLiteral(_ literal: String) -> String {
348+
if options.usesExtendedWhitespace || literal.containsRegexMetaCharacters {
349+
return #"\Q\#(literal)\E"#
323350
} else {
324-
output(literal.escapingConfusableCharacters())
351+
return literal.escapingConfusableCharacters()
325352
}
326353
}
327354

@@ -350,7 +377,7 @@ extension LiteralPrinter {
350377
case let .custom(charClass):
351378
outputCustomCharacterClass(charClass)
352379
case let .quotedLiteral(literal):
353-
if literal.containsRegexMetaCharacters {
380+
if options.usesExtendedWhitespace || literal.containsRegexMetaCharacters {
354381
output(#"\Q\#(literal)\E"#)
355382
} else {
356383
output(literal)
@@ -374,14 +401,34 @@ extension LiteralPrinter {
374401
}
375402
output("]")
376403
}
404+
405+
/// Emits pattern text for an AST atom that was left unconverted.
///
/// - A `.property` atom is printed using its `_regexBase` string when one is
///   available; otherwise it is recorded via `saveInconvertible`.
/// - A `.namedCharacter` atom is printed as `\N{name}`.
/// - Every other atom kind is recorded via `saveInconvertible`.
///
/// - Parameter ast: The unconverted AST atom to print.
mutating func outputUnconvertedAST(_ ast: AST.Atom) {
406+
switch ast.kind {
407+
case let .property(property):
408+
// `_regexBase` is optional; `nil` is treated as "cannot be printed".
if let base = property._regexBase {
409+
output(base)
410+
} else {
411+
saveInconvertible(.atom(.unconverted(.init(ast: ast))))
412+
}
413+
case let .namedCharacter(name):
414+
output("\\N{\(name)}")
415+
default:
416+
saveInconvertible(.atom(.unconverted(.init(ast: ast))))
417+
}
418+
}
377419
}
378420

379421
// MARK: - Supporting extensions
380422

381423
fileprivate let metachars = Set(#"\[](){}|+*?^$.-"#)
424+
382425
extension String {
383426
var containsRegexMetaCharacters: Bool {
384-
contains { metachars.contains($0) }
427+
contains(where: \.isRegexMetaCharacter)
428+
}
429+
430+
/// Returns a new string formed by mapping every character through
/// `Character.escapingConfusable` and joining the results in order.
func escapingConfusableCharacters() -> String {
431+
lazy.map(\.escapingConfusable).joined()
385432
}
386433
}
387434

@@ -392,19 +439,25 @@ extension UnicodeScalar {
392439
}
393440

394441
extension Character {
442+
/// Whether this character is a member of the file-level `metachars` set.
var isRegexMetaCharacter: Bool {
443+
metachars.contains(self)
444+
}
445+
395446
var escapingConfusable: String {
396447
if isConfusable {
397-
String(unicodeScalars.first!) +
448+
return String(unicodeScalars.first!) +
398449
unicodeScalars.dropFirst().lazy.map(\.escapedString).joined()
399450
} else {
400-
String(self)
451+
return String(self)
401452
}
402453
}
403-
}
404-
405-
extension String {
406-
func escapingConfusableCharacters() -> String {
407-
lazy.map(\.escapingConfusable).joined()
454+
455+
/// The text used to print this character as a single atom.
///
/// A character for which `isRegexMetaCharacter` is `true` is prefixed with a
/// backslash; any other character is returned as its `escapingConfusable`
/// form.
var escapingForLiteral: String {
456+
if isRegexMetaCharacter {
457+
return "\\\(self)"
458+
} else {
459+
return escapingConfusable
460+
}
408461
}
409462
}
410463

@@ -493,11 +546,10 @@ extension AST.MatchingOptionSequence {
493546

494547
if resetsCurrentOptions {
495548
assert(removing.isEmpty)
496-
return "(?^\(adding):"
549+
return "(?^\(adding)"
497550
} else {
498551
return "(?\(adding)"
499552
+ (removing.isEmpty ? "" : "-\(removing)")
500-
+ ":"
501553
}
502554
}
503555
}
@@ -521,7 +573,7 @@ extension DSLTree._AST.GroupKind {
521573
case .atomicScriptRun: return "(*asr:"
522574

523575
case let .changeMatchingOptions(sequence):
524-
return sequence._patternString
576+
return sequence._patternString + ":"
525577
}
526578
}
527579
}

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,11 @@ extension MatchingOptions {
110110
!stack.last!.contains(.unicodeWordBoundaries)
111111
}
112112

113+
/// Whether the top entry of `stack` contains `.extended` or `.extraExtended`.
///
/// `stack.last` is force-unwrapped, so this traps if `stack` is empty.
var usesExtendedWhitespace: Bool {
114+
stack.last!.contains(.extended)
115+
|| stack.last!.contains(.extraExtended)
116+
}
117+
113118
enum SemanticLevel {
114119
case graphemeCluster
115120
case unicodeScalar
@@ -160,6 +165,10 @@ extension MatchingOptions {
160165

161166
// Swift-only default possessive quantifier
162167
case possessiveByDefault
168+
169+
// Whitespace options
170+
case extended
171+
case extraExtended
163172

164173
init?(_ astKind: AST.MatchingOption.Kind) {
165174
switch astKind {
@@ -197,10 +206,10 @@ extension MatchingOptions {
197206
self = .byteSemantics
198207
case .possessiveByDefault:
199208
self = .possessiveByDefault
200-
201-
// Whitespace options are only relevant during parsing, not compilation.
202-
case .extended, .extraExtended:
203-
return nil
209+
case .extended:
210+
self = .extended
211+
case .extraExtended:
212+
self = .extraExtended
204213
}
205214
}
206215

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -847,7 +847,7 @@ extension AST.Atom.CharacterProperty {
847847
// TODO: Some way to integrate this with conversion...
848848
var _patternBase: String {
849849
if isUnprintableProperty {
850-
return _regexBase
850+
return _regexBase ?? " // TODO: Property \(self)"
851851
}
852852

853853
return _dslBase
@@ -886,25 +886,19 @@ extension AST.Atom.CharacterProperty {
886886
}
887887
}
888888

889-
var _regexBase: String {
889+
var _regexBase: String? {
890+
let prefix = isInverted ? "\\P" : "\\p"
890891
switch kind {
891892
case .ascii:
892893
return "[:\(isInverted ? "^" : "")ascii:]"
893894

894-
case .binary(let b, value: _):
895-
if isInverted {
896-
return "[^\\p{\(b.rawValue)}]"
897-
} else {
898-
return "\\p{\(b.rawValue)}"
899-
}
895+
case .binary(let b, value: let value):
896+
let suffix = value ? "" : "=false"
897+
return "\(prefix){\(b.rawValue)\(suffix)}"
900898

901899
case .generalCategory(let gc):
902-
if isInverted {
903-
return "[^\\p{\(gc.rawValue)}]"
904-
} else {
905-
return "\\p{\(gc.rawValue)}"
906-
}
907-
900+
return "\(prefix){\(gc.rawValue)}"
901+
908902
case .posix(let p):
909903
return "[:\(isInverted ? "^" : "")\(p.rawValue):]"
910904

@@ -914,8 +908,16 @@ extension AST.Atom.CharacterProperty {
914908
case .scriptExtension(let s):
915909
return "[:\(isInverted ? "^" : "")scx=\(s.rawValue):]"
916910

911+
case .any:
912+
return "\(prefix){Any}"
913+
case .assigned:
914+
return "\(prefix){Assigned}"
915+
916+
case .named(let name):
917+
return "\\N{\(name)}"
918+
917919
default:
918-
return " // TODO: Property \(self)"
920+
return nil
919921
}
920922
}
921923
}
@@ -1066,7 +1068,7 @@ extension AST.Atom {
10661068
return "<#value#>"
10671069

10681070
case let .property(p):
1069-
return p._regexBase
1071+
return p._regexBase ?? " // TODO: Property \(p)"
10701072

10711073
case let .escaped(e):
10721074
return "\\\(e.character)"

Tests/RegexTests/LiteralPrinterTests.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,18 @@ extension RegexTests {
2727

2828
func testPrintableDSLRegex() throws {
2929
let regex = Regex {
30-
OneOrMore("a", .reluctant)
30+
OneOrMore("aaa", .reluctant)
3131
Regex {
3232
ChoiceOf {
33-
ZeroOrMore("b")
33+
ZeroOrMore("bbb")
3434
OneOrMore("d")
3535
Repeat("e", 3...)
3636
}
3737
}.dotMatchesNewlines()
3838
Optionally("c")
3939
}.ignoresCase()
4040
let pattern = try XCTUnwrap(regex._literalPattern)
41-
XCTAssertEqual("(?i:a+?(?s:b*|d+|e{3,})c?)", pattern)
41+
XCTAssertEqual("(?i:(?:aaa)+?(?s:(?:bbb)*|d+|e{3,})c?)", pattern)
4242

4343
let nonPrintableRegex = Regex {
4444
OneOrMore("a")

Tests/RegexTests/MatchTests.swift

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,15 @@ struct MatchError: Error {
2525
// literal patterns round trip correctly.
2626
func _roundTripLiteral(
2727
_ regexStr: String,
28-
syntax: SyntaxOptions = .traditional
29-
) throws {
28+
syntax: SyntaxOptions
29+
) throws -> Regex<AnyRegexOutput>? {
3030
guard let pattern = try Regex(regexStr, syntax: syntax)._literalPattern else {
31-
return
31+
return nil
3232
}
3333

3434
let remadeRegex = try Regex(pattern)
3535
XCTAssertEqual(pattern, remadeRegex._literalPattern)
36+
return remadeRegex
3637
}
3738

3839
func _firstMatch(
@@ -44,7 +45,33 @@ func _firstMatch(
4445
) throws -> (String, [String?])? {
4546
var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel)
4647
let result = try regex.firstMatch(in: input)
47-
try _roundTripLiteral(regexStr, syntax: syntax)
48+
do {
49+
let roundTripRegex = try? _roundTripLiteral(regexStr, syntax: syntax)
50+
let roundTripResult = try? roundTripRegex?
51+
.matchingSemantics(semanticLevel)
52+
.firstMatch(in: input)?[0]
53+
.substring
54+
switch (result?[0].substring, roundTripResult) {
55+
case let (match?, rtMatch?):
56+
XCTAssertEqual(match, rtMatch)
57+
case (nil, nil):
58+
break // okay
59+
case let (match?, _):
60+
XCTFail("""
61+
Didn't match in round-tripped version of '\(regexStr)'
62+
For input '\(input)'
63+
Original: '\(regexStr)'
64+
_literalPattern: '\(roundTripRegex?._literalPattern ?? "<no pattern>")'
65+
""")
66+
case let (_, rtMatch?):
67+
XCTFail("""
68+
Incorrectly matched as '\(rtMatch)'
69+
For input '\(input)'
70+
Original: '\(regexStr)'
71+
_literalPattern: '\(roundTripRegex!._literalPattern!)'
72+
""")
73+
}
74+
}
4875

4976
if validateOptimizations {
5077
assert(regex._forceAction(.addOptions(.disableOptimizations)))

0 commit comments

Comments
 (0)