From 657c4a6617da007fdf8f47c0b7ffc99ec22a8d9a Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 13 Apr 2022 15:25:56 +0100 Subject: [PATCH] Allow POSIX character properties outside of custom character classes This matches the ICU behavior, and appears to be suggested by UTS#18. --- .../Regex/Parse/LexicalAnalysis.swift | 17 +++++++---------- Sources/_RegexParser/Regex/Parse/Parse.swift | 4 ++-- .../_StringProcessing/Utility/ASTBuilder.swift | 5 +++++ Tests/RegexTests/ParseTests.swift | 11 ++++++++--- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index bf53d0ab1..f70989c9f 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1064,14 +1064,13 @@ extension Source { } mutating func lexCustomCCStart( - context: ParsingContext ) throws -> Located? { recordLoc { src in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise // we have a custom character class. // TODO: This behavior seems subtle, could we warn? - guard !src.canLexPOSIXCharacterProperty(context: context) else { + guard !src.canLexPOSIXCharacterProperty() else { return nil } if src.tryEat("[") { @@ -1104,11 +1103,8 @@ extension Source { } private mutating func lexPOSIXCharacterProperty( - context: ParsingContext ) throws -> Located? { - // Only allowed in a custom character class. - guard context.isInCustomCharacterClass else { return nil } - return try recordLoc { src in + try recordLoc { src in try src.tryEating { src in guard src.tryEat(sequence: "[:") else { return nil } let inverted = src.tryEat("^") @@ -1127,10 +1123,10 @@ extension Source { } } - private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool { + private func canLexPOSIXCharacterProperty() -> Bool { do { var src = self - return try src.lexPOSIXCharacterProperty(context: context) != nil + return try src.lexPOSIXCharacterProperty() != nil } catch { // We want to tend on the side of lexing a POSIX character property, so // even if it is invalid in some way (e.g invalid property names), still @@ -1818,8 +1814,9 @@ extension Source { if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful - // POSIX character property. - if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value { + // POSIX character property. Like \p{...} this is also allowed outside of + // a custom character class. + if let prop = try src.lexPOSIXCharacterProperty()?.value { return .property(prop) } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index d70a1f14f..c3aa3500b 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -403,7 +403,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart(context: context) { + if let cccStart = try source.lexCustomCCStart() { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -487,7 +487,7 @@ extension Parser { while source.peek() != "]" && source.peekCCBinOp() == nil { // Nested custom character class. - if let cccStart = try source.lexCustomCCStart(context: context) { + if let cccStart = try source.lexCustomCCStart() { members.append(.custom(try parseCustomCharacterClass(cccStart))) continue } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 7a8edcd24..107a3e3c9 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -354,6 +354,11 @@ func prop( ) -> AST.Node { atom(.property(.init(kind, isInverted: inverted, isPOSIX: false))) } +func posixProp( + _ kind: AST.Atom.CharacterProperty.Kind, inverted: Bool = false +) -> AST.Node { + atom(.property(.init(kind, isInverted: inverted, isPOSIX: true))) +} // Raw atom constructing variant func atom_a( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index aa0ebbf98..e9b422379 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -477,12 +477,9 @@ extension RegexTests { // These are custom character classes, not invalid POSIX character classes. // TODO: This behavior is subtle, we ought to warn. parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e"))) - parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":")) parseTest("[:a]", charClass(":", "a")) parseTest("[a:]", charClass("a", ":")) parseTest("[:]", charClass(":")) - parseTest("[::]", charClass(":", ":")) - parseTest("[:=:]", charClass(":", "=", ":")) parseTest("[[:]]", charClass(charClass(":"))) parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":"))) @@ -522,6 +519,12 @@ extension RegexTests { posixProp_m(.binary(.uppercase), inverted: true), "c", "d")) + // Like ICU, we allow POSIX character properties outside of custom character + // classes. This also appears to be suggested by UTS#18. + // TODO: We should likely emit a warning. + parseTest("[:space:]", posixProp(.binary(.whitespace))) + parseTest("[:script=Greek:]", posixProp(.script(.greek))) + parseTest("[[[:space:]]]", charClass(charClass( posixProp_m(.binary(.whitespace)) ))) @@ -2252,6 +2255,8 @@ extension RegexTests { diagnosticTest("[[:a:", .expected("]")) diagnosticTest("[[:a[:]", .expected("]")) + diagnosticTest("[::]", .emptyProperty) + diagnosticTest("[:=:]", .emptyProperty) diagnosticTest("[[::]]", .emptyProperty) diagnosticTest("[[:=:]]", .emptyProperty)