diff --git a/Package.swift b/Package.swift index f9eb95e8e..526d1d0e1 100644 --- a/Package.swift +++ b/Package.swift @@ -7,10 +7,18 @@ let availabilityDefinition = PackageDescription.SwiftSetting.unsafeFlags([ "-Xfrontend", "-define-availability", "-Xfrontend", - #"SwiftStdlib 5.7:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999"#, + "SwiftStdlib 5.7:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999", ]) -let stdlibSettings: [PackageDescription.SwiftSetting] = [ +/// Swift settings for building a private stdlib-like module that is to be used +/// by other stdlib-like modules only. +let privateStdlibSettings: [PackageDescription.SwiftSetting] = [ + .unsafeFlags(["-Xfrontend", "-disable-implicit-concurrency-module-import"]), + .unsafeFlags(["-Xfrontend", "-disable-implicit-string-processing-module-import"]), +] + +/// Swift settings for building a user-facing stdlib-like module. +let publicStdlibSettings: [PackageDescription.SwiftSetting] = [ .unsafeFlags(["-enable-library-evolution"]), .unsafeFlags(["-Xfrontend", "-disable-implicit-concurrency-module-import"]), .unsafeFlags(["-Xfrontend", "-disable-implicit-string-processing-module-import"]), @@ -43,7 +51,7 @@ let package = Package( .target( name: "_RegexParser", dependencies: [], - swiftSettings: stdlibSettings), + swiftSettings: privateStdlibSettings), .testTarget( name: "MatchingEngineTests", dependencies: [ @@ -55,11 +63,11 @@ let package = Package( .target( name: "_StringProcessing", dependencies: ["_RegexParser", "_CUnicode"], - swiftSettings: stdlibSettings), + swiftSettings: publicStdlibSettings), .target( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], - swiftSettings: stdlibSettings), + swiftSettings: publicStdlibSettings), .testTarget( name: "RegexTests", dependencies: ["_StringProcessing"], diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index 497d54506..57e2f31dd 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -70,7 +70,8 @@ struct PatternConverter: ParsableCommand { print() if !skipDSL { - let render = ast.renderAsBuilderDSL( + let render = renderAsBuilderDSL( + ast: ast, maxTopDownLevels: topDownConversionLimit, minBottomUpLevels: bottomUpConversionLimit ) diff --git a/Sources/RegexBuilder/Algorithms.swift b/Sources/RegexBuilder/Algorithms.swift index f1f6d97a0..88916879b 100644 --- a/Sources/RegexBuilder/Algorithms.swift +++ b/Sources/RegexBuilder/Algorithms.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _StringProcessing +@_spi(RegexBuilder) import _StringProcessing // FIXME(rdar://92459215): We should be using 'some RegexComponent' instead of // for the methods below that don't impose any additional @@ -313,3 +313,31 @@ where Self: BidirectionalCollection, SubSequence == Substring { try replace(content(), maxReplacements: maxReplacements, with: replacement) } } + +// String split overload breakers + +extension StringProtocol where SubSequence == Substring { + @available(SwiftStdlib 5.7, *) + public func split( + separator: String, + maxSplits: Int = .max, + omittingEmptySubsequences: Bool = true + ) -> [Substring] { + return _split( + separator: separator, + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences) + } + + @available(SwiftStdlib 5.7, *) + public func split( + separator: Substring, + maxSplits: Int = .max, + omittingEmptySubsequences: Bool = true + ) -> [Substring] { + 
return _split( + separator: separator, + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences) + } +} diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 10590fb74..e8dffaa8e 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -127,6 +127,19 @@ extension DSLTree.Node { } } +/// A regex component that matches exactly one occurrence of its underlying +/// component. +@available(SwiftStdlib 5.7, *) +public struct One: RegexComponent { + public var regex: Regex + + public init( + _ component: Component + ) where Component.RegexOutput == Output { + self.regex = component.regex + } +} + @available(SwiftStdlib 5.7, *) public struct OneOrMore: _BuiltinRegexComponent { public var regex: Regex diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index be1548b72..44bc10828 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -29,7 +29,6 @@ extension AST { extension AST { /// A node in the regex AST. - @frozen public indirect enum Node: Hashable, _TreeNode //, _ASTPrintable ASTValue, ASTAction { @@ -53,6 +52,9 @@ extension AST { /// Comments, non-semantic whitespace, etc case trivia(Trivia) + /// Intepolation `<{...}>`, currently reserved for future use. + case interpolation(Interpolation) + case atom(Atom) case customCharacterClass(CustomCharacterClass) @@ -78,6 +80,7 @@ extension AST.Node { case let .quantification(v): return v case let .quote(v): return v case let .trivia(v): return v + case let .interpolation(v): return v case let .atom(v): return v case let .customCharacterClass(v): return v case let .empty(v): return v @@ -130,7 +133,7 @@ extension AST.Node { case .conditional, .customCharacterClass, .absentFunction: return true case .alternation, .concatenation, .quantification, .quote, .trivia, - .empty: + .empty, .interpolation: return false } } @@ -194,6 +197,16 @@ extension AST { } } + public struct Interpolation: Hashable, _ASTNode { + public let contents: String + public let location: SourceLocation + + public init(_ contents: String, _ location: SourceLocation) { + self.contents = contents + self.location = location + } + } + public struct Empty: Hashable, _ASTNode { public let location: SourceLocation @@ -249,7 +262,6 @@ extension AST { } public struct Reference: Hashable { - @frozen public enum Kind: Hashable { // \n \gn \g{n} \g \g'n' (?n) (?(n)... // Oniguruma: \k, \k'n' diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 19e2fb498..992604852 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -19,7 +19,6 @@ extension AST { self.location = loc } - @frozen public enum Kind: Hashable { /// Just a character /// @@ -146,7 +145,6 @@ extension AST.Atom { // Characters, character types, literals, etc., derived from // an escape sequence. - @frozen public enum EscapedBuiltin: Hashable { // TODO: better doc comments @@ -399,7 +397,6 @@ extension AST.Atom { } extension AST.Atom.CharacterProperty { - @frozen public enum Kind: Hashable { /// Matches any character, equivalent to Oniguruma's '\O'. case any @@ -430,15 +427,39 @@ extension AST.Atom.CharacterProperty { /// Character name in the form `\p{name=...}` case named(String) + /// Numeric type. + case numericType(Unicode.NumericType) + + /// Numeric value. + case numericValue(Double) + + /// Case mapping. 
+ case mapping(MapKind, String) + + /// Canonical Combining Class. + case ccc(Unicode.CanonicalCombiningClass) + + /// Character age, as per UnicodeScalar.Properties.age. + case age(major: Int, minor: Int) + + /// A block property. + case block(Unicode.Block) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) - case onigurumaSpecial(OnigurumaSpecialProperty) + + /// Some special properties implemented by Java. + case javaSpecial(JavaSpecial) + + public enum MapKind: Hashable { + case lowercase + case uppercase + case titlecase + } } - // TODO: erm, separate out or fold into something? splat it in? - @frozen public enum PCRESpecialCategory: String, Hashable { case alphanumeric = "Xan" case posixSpace = "Xps" @@ -446,11 +467,33 @@ extension AST.Atom.CharacterProperty { case universallyNamed = "Xuc" case perlWord = "Xwd" } + + /// Special Java properties that correspond to methods on + /// `java.lang.Character`, with the `java` prefix replaced by `is`. + public enum JavaSpecial: String, Hashable, CaseIterable { + case alphabetic = "javaAlphabetic" + case defined = "javaDefined" + case digit = "javaDigit" + case identifierIgnorable = "javaIdentifierIgnorable" + case ideographic = "javaIdeographic" + case isoControl = "javaISOControl" + case javaIdentifierPart = "javaJavaIdentifierPart" // not a typo, that's actually the name + case javaIdentifierStart = "javaJavaIdentifierStart" // not a typo, that's actually the name + case javaLetter = "javaLetter" + case javaLetterOrDigit = "javaLetterOrDigit" + case lowerCase = "javaLowerCase" + case mirrored = "javaMirrored" + case spaceChar = "javaSpaceChar" + case titleCase = "javaTitleCase" + case unicodeIdentifierPart = "javaUnicodeIdentifierPart" + case unicodeIdentifierStart = "javaUnicodeIdentifierStart" + case upperCase = "javaUpperCase" + case whitespace = "javaWhitespace" + } } extension AST.Atom { /// Anchors and other built-in zero-width assertions. - @frozen public enum AssertionKind: String { /// \A case startOfSubject = #"\A"# @@ -824,7 +867,7 @@ extension AST.Node { case .alternation, .concatenation, .group, .conditional, .quantification, .quote, .trivia, .customCharacterClass, .empty, - .absentFunction: + .absentFunction, .interpolation: return nil } } diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index c1dd4c620..087387c1e 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -27,7 +27,6 @@ extension AST { self.location = sr } - @frozen public enum Member: Hashable { /// A nested custom character class `[[ab][cd]]` case custom(CustomCharacterClass) @@ -52,20 +51,23 @@ extension AST { public var lhs: Atom public var dashLoc: SourceLocation public var rhs: Atom + public var trivia: [AST.Trivia] - public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) { + public init( + _ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom, + trivia: [AST.Trivia] + ) { self.lhs = lhs self.dashLoc = dashLoc self.rhs = rhs + self.trivia = trivia } } - @frozen public enum SetOp: String, Hashable { case subtraction = "--" case intersection = "&&" case symmetricDifference = "~~" } - @frozen public enum Start: String { case normal = "[" case inverted = "[^" @@ -98,6 +100,11 @@ extension CustomCC.Member { return false } + public var asTrivia: AST.Trivia? 
{ + guard case .trivia(let t) = self else { return nil } + return t + } + public var isSemantic: Bool { !isTrivia } diff --git a/Sources/_RegexParser/Regex/AST/Quantification.swift b/Sources/_RegexParser/Regex/AST/Quantification.swift index fa7e4de82..c6d4f0101 100644 --- a/Sources/_RegexParser/Regex/AST/Quantification.swift +++ b/Sources/_RegexParser/Regex/AST/Quantification.swift @@ -36,7 +36,6 @@ extension AST { self.trivia = trivia } - @frozen public enum Amount: Hashable { case zeroOrMore // * case oneOrMore // + @@ -47,7 +46,6 @@ extension AST { case range(Located, Located) // {n,m} } - @frozen public enum Kind: String, Hashable { case eager = "" case reluctant = "?" diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index 0287e7337..2a5a47395 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -42,6 +42,21 @@ extension CaptureList { } } +extension CaptureList { + /// Retrieve the capture index of a given named capture, or `nil` if there is + /// no such capture. + public func indexOfCapture(named name: String) -> Int? { + // Named references are guaranteed to be unique for literal ASTs by Sema. + // The DSL tree does not use named references. + captures.indices.first(where: { captures[$0].name == name }) + } + + /// Whether the capture list has a given named capture. + public func hasCapture(named name: String) -> Bool { + indexOfCapture(named: name) != nil + } +} + // MARK: Generating from AST extension AST.Node { @@ -103,7 +118,7 @@ extension AST.Node { break } - case .quote, .trivia, .atom, .customCharacterClass, .empty: + case .quote, .trivia, .atom, .customCharacterClass, .empty, .interpolation: break } } diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index c0ece78ff..fb122e027 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -13,17 +13,21 @@ extension Source { typealias PropertyKind = AST.Atom.CharacterProperty.Kind static private func withNormalizedForms( - _ str: String, match: (String) -> T? - ) -> T? { + _ str: String, requireInPrefix: Bool = false, match: (String) throws -> T? + ) rethrows -> T? { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" } .lowercased() - if let m = match(str) { + if requireInPrefix { + guard str.hasPrefix("in") else { return nil } + return try match(String(str.dropFirst(2))) + } + if let m = try match(str) { return m } - if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) { + if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) { return m } return nil @@ -79,6 +83,19 @@ extension Source { } } + static private func classifyNumericType( + _ str: String + ) -> Unicode.NumericType? { + withNormalizedForms(str) { str in + switch str { + case "decimal": return .decimal + case "digit": return .digit + case "numeric": return .numeric + default: return nil + } + } + } + static private func classifyBoolProperty( _ str: String ) -> Unicode.BinaryProperty? 
{ @@ -351,6 +368,342 @@ extension Source { } } + static private func classifyBlockProperty( + _ value: String, valueOnly: Bool + ) -> Unicode.Block? { + // Require an 'in' prefix for the shorthand variant. This is supported by + // Oniguruma and Perl. + // TODO: Perl discourages the shorthand 'in' prefix, should we diagnose and + // suggest an explicit key/value? + withNormalizedForms(value, requireInPrefix: valueOnly) { str in + switch str { + case "adlam": return .adlam + case "aegeannumbers": return .aegeanNumbers + case "ahom": return .ahom + case "alchemical", "alchemicalsymbols": return .alchemicalSymbols + case "alphabeticpf", "alphabeticpresentationforms": return .alphabeticPresentationForms + case "anatolianhieroglyphs": return .anatolianHieroglyphs + case "ancientgreekmusic", "ancientgreekmusicalnotation": return .ancientGreekMusicalNotation + case "ancientgreeknumbers": return .ancientGreekNumbers + case "ancientsymbols": return .ancientSymbols + case "arabic": return .arabic + case "arabicexta", "arabicextendeda": return .arabicExtendedA + case "arabicextb", "arabicextendedb": return .arabicExtendedB + case "arabicmath", "arabicmathematicalalphabeticsymbols": return .arabicMathematicalAlphabeticSymbols + case "arabicpfa", "arabicpresentationformsa": return .arabicPresentationFormsA + case "arabicpfb", "arabicpresentationformsb": return .arabicPresentationFormsB + case "arabicsup", "arabicsupplement": return .arabicSupplement + case "armenian": return .armenian + case "arrows": return .arrows + case "ascii", "basiclatin": return .basicLatin + case "avestan": return .avestan + case "balinese": return .balinese + case "bamum": return .bamum + case "bamumsup", "bamumsupplement": return .bamumSupplement + case "bassavah": return .bassaVah + case "batak": return .batak + case "bengali": return .bengali + case "bhaiksuki": return .bhaiksuki + case "blockelements": return .blockElements + case "bopomofo": return .bopomofo + case "bopomofoext", "bopomofoextended": return .bopomofoExtended + case "boxdrawing": return .boxDrawing + case "brahmi": return .brahmi + case "braille", "braillepatterns": return .braillePatterns + case "buginese": return .buginese + case "buhid": return .buhid + case "byzantinemusic", "byzantinemusicalsymbols": return .byzantineMusicalSymbols + case "carian": return .carian + case "caucasianalbanian": return .caucasianAlbanian + case "chakma": return .chakma + case "cham": return .cham + case "cherokee": return .cherokee + case "cherokeesup", "cherokeesupplement": return .cherokeeSupplement + case "chesssymbols": return .chessSymbols + case "chorasmian": return .chorasmian + case "cjk", "cjkunifiedideographs": return .cjkUnifiedIdeographs + case "cjkcompat", "cjkcompatibility": return .cjkCompatibility + case "cjkcompatforms", "cjkcompatibilityforms": return .cjkcompatibilityForms + case "cjkcompatideographs", "cjkcompatibilityideographs": return .cjkCompatibilityIdeographs + case "cjkcompatideographssup", "cjkcompatibilityideographssupplement": return .cjkCompatibilityIdeographsSupplement + case "cjkexta", "cjkunifiedideographsextensiona": return .cjkUnifiedIdeographsExtensionA + case "cjkextb", "cjkunifiedideographsextensionb": return .cjkUnifiedIdeographsExtensionB + case "cjkextc", "cjkunifiedideographsextensionc": return .cjkUnifiedIdeographsExtensionC + case "cjkextd", "cjkunifiedideographsextensiond": return .cjkUnifiedIdeographsExtensionD + case "cjkexte", "cjkunifiedideographsextensione": return .cjkUnifiedIdeographsExtensionE + case "cjkextf", 
"cjkunifiedideographsextensionf": return .cjkUnifiedIdeographsExtensionF + case "cjkextg", "cjkunifiedideographsextensiong": return .cjkUnifiedIdeographsExtensionG + case "cjkradicalssup", "cjkradicalssupplement": return .cjkRadicalsSupplement + case "cjkstrokes": return .cjkStrokes + case "cjksymbols", "cjksymbolsandpunctuation": return .cjkSymbolsAndPunctuation + case "compatjamo", "hangulcompatibilityjamo": return .hangulCompatibilityJamo + case "controlpictures": return .controlPictures + case "coptic": return .coptic + case "copticepactnumbers": return .copticEpactNumbers + case "countingrod", "countingrodnumerals": return .countingRodNumerals + case "cuneiform": return .cuneiform + case "cuneiformnumbers", "cuneiformnumbersandpunctuation": return .cuneiformNumbersAndPunctuation + case "currencysymbols": return .currencySymbols + case "cypriotsyllabary": return .cypriotSyllabary + case "cyprominoan": return .cyproMinoan + case "cyrillic": return .cyrillic + case "cyrillicexta", "cyrillicextendeda": return .cyrillicExtendedA + case "cyrillicextb", "cyrillicextendedb": return .cyrillicExtendedB + case "cyrillicextc", "cyrillicextendedc": return .cyrillicExtendedC + case "cyrillicsup", "cyrillicsupplement", "cyrillicsupplementary": return .cyrillicSupplement + case "deseret": return .deseret + case "devanagari": return .devanagari + case "devanagariext", "devanagariextended": return .devanagariExtended + case "diacriticals", "combiningdiacriticalmarks": return .combiningDiacriticalMarks + case "diacriticalsext", "combiningdiacriticalmarksextended": return .combiningDiacriticalMarksExtended + case "diacriticalsforsymbols", "combiningdiacriticalmarksforsymbols", + "combiningmarksforsymbols": return .combiningDiacriticalMarksForSymbols + case "diacriticalssup", "combiningdiacriticalmarkssupplement": return .combiningDiacriticalMarksSupplement + case "dingbats": return .dingbats + case "divesakuru": return .divesAkuru + case "dogra": return .dogra + case "domino", "dominotiles": return .dominoTiles + case "duployan": return .duployan + case "earlydynasticcuneiform": return .earlyDynasticCuneiform + case "egyptianhieroglyphformatcontrols": return .egyptianHieroglyphFormatControls + case "egyptianhieroglyphs": return .egyptianHieroglyphs + case "elbasan": return .elbasan + case "elymaic": return .elymaic + case "emoticons": return .emoticons + case "enclosedalphanum", "enclosedalphanumerics": return .enclosedAlphanumerics + case "enclosedalphanumsup", "enclosedalphanumericsupplement": return .enclosedAlphanumericSupplement + case "enclosedcjk", "enclosedcjklettersandmonths": return .enclosedCJKLettersAndMonths + case "enclosedideographicsup", "enclosedideographicsupplement": return .enclosedIdeographicSupplement + case "ethiopic": return .ethiopic + case "ethiopicext", "ethiopicextended": return .ethiopicExtended + case "ethiopicexta", "ethiopicextendeda": return .ethiopicExtendedA + case "ethiopicextb", "ethiopicextendedb": return .ethiopicExtendedB + case "ethiopicsup", "ethiopicsupplement": return .ethiopicSupplement + case "geometricshapes": return .geometricShapes + case "geometricshapesext", "geometricshapesextended": return .geometricShapesExtended + case "georgian": return .georgian + case "georgianext", "georgianextended": return .georgianExtended + case "georgiansup", "georgiansupplement": return .georgianSupplement + case "glagolitic": return .glagolitic + case "glagoliticsup", "glagoliticsupplement": return .glagoliticSupplement + case "gothic": return .gothic + case "grantha": 
return .grantha + case "greek", "greekandcoptic": return .greekAndCoptic + case "greekext", "greekextended": return .greekExtended + case "gujarati": return .gujarati + case "gunjalagondi": return .gunjalaGondi + case "gurmukhi": return .gurmukhi + case "halfandfullforms", "halfwidthandfullwidthforms": return .halfwidthAndFullwidthForms + case "halfmarks", "combininghalfmarks": return .combiningHalfMarks + case "hangul", "hangulsyllables": return .hangulSyllables + case "hanifirohingya": return .hanifiRohingya + case "hanunoo": return .hanunoo + case "hatran": return .hatran + case "hebrew": return .hebrew + case "highpusurrogates", "highprivateusesurrogates": return .highPrivateUseSurrogates + case "highsurrogates": return .highSurrogates + case "hiragana": return .hiragana + case "idc", "ideographicdescriptioncharacters": return .ideographicDescriptionCharacters + case "ideographicsymbols", "ideographicsymbolsandpunctuation": return .ideographicSymbolsAndPunctuation + case "imperialaramaic": return .imperialAramaic + case "indicnumberforms", "commonindicnumberforms": return .commonIndicNumberForms + case "indicsiyaqnumbers": return .indicSiyaqNumbers + case "inscriptionalpahlavi": return .inscriptionalPahlavi + case "inscriptionalparthian": return .inscriptionalParthian + case "ipaext", "ipaextensions": return .ipaExtensions + case "jamo", "hanguljamo": return .hangulJamo + case "jamoexta", "hanguljamoextendeda": return .hangulJamoExtendedA + case "jamoextb", "hanguljamoextendedb": return .hangulJamoExtendedB + case "javanese": return .javanese + case "kaithi": return .kaithi + case "kanaexta", "kanaextendeda": return .kanaExtendedA + case "kanaextb", "kanaextendedb": return .kanaExtendedB + case "kanasup", "kanasupplement": return .kanaSupplement + case "kanbun": return .kanbun + case "kangxi", "kangxiradicals": return .kangxiRadicals + case "kannada": return .kannada + case "katakana": return .katakana + case "katakanaext", "katakanaphoneticextensions": return .katakanaPhoneticExtensions + case "kayahli": return .kayahLi + case "kharoshthi": return .kharoshthi + case "khitansmallscript": return .khitanSmallScript + case "khmer": return .khmer + case "khmersymbols": return .khmerSymbols + case "khojki": return .khojki + case "khudawadi": return .khudawadi + case "lao": return .lao + case "latin1sup", "latin1supplement", "latin1": return .latin1Supplement + case "latinexta", "latinextendeda": return .latinExtendedA + case "latinextadditional", "latinextendedadditional": return .latinExtendedAdditional + case "latinextb", "latinextendedb": return .latinExtendedB + case "latinextc", "latinextendedc": return .latinExtendedC + case "latinextd", "latinextendedd": return .latinExtendedD + case "latinexte", "latinextendede": return .latinExtendedE + case "latinextf", "latinextendedf": return .latinExtendedF + case "latinextg", "latinextendedg": return .latinExtendedG + case "lepcha": return .lepcha + case "letterlikesymbols": return .letterLikeSymbols + case "limbu": return .limbu + case "lineara": return .linearA + case "linearbideograms": return .linearBIdeograms + case "linearbsyllabary": return .linearBSyllabary + case "lisu": return .lisu + case "lisusup", "lisusupplement": return .lisuSupplement + case "lowsurrogates": return .lowSurrogates + case "lycian": return .lycian + case "lydian": return .lydian + case "mahajani": return .mahajani + case "mahjong", "mahjongtiles": return .mahjongTiles + case "makasar": return .makasar + case "malayalam": return .malayalam + case "mandaic": return 
.mandaic + case "manichaean": return .manichaean + case "marchen": return .marchen + case "masaramgondi": return .masaramGondi + case "mathalphanum", "mathematicalalphanumericsymbols": return .mathematicalAlphanumericSymbols + case "mathoperators", "mathematicaloperators": return .mathematicalOperators + case "mayannumerals": return .mayanNumerals + case "medefaidrin": return .medefaidrin + case "meeteimayek": return .meeteiMayek + case "meeteimayekext", "meeteimayekextensions": return .meeteiMayekExtensions + case "mendekikakui": return .mendeKikakui + case "meroiticcursive": return .meroiticCursive + case "meroitichieroglyphs": return .meroiticHieroglyphs + case "miao": return .miao + case "miscarrows", "miscellaneoussymbolsandarrows": return .miscellaneousSymbolsAndArrows + case "miscmathsymbolsa", "miscellaneousmathematicalsymbolsa": return .miscellaneousMathematicalSymbolsA + case "miscmathsymbolsb", "miscellaneousmathematicalsymbolsb": return .miscellaneousMathematicalSymbolsB + case "miscpictographs", "miscellaneoussymbolsandpictographs": return .miscellaneousSymbolsandPictographs + case "miscsymbols", "miscellaneoussymbols": return .miscellaneousSymbols + case "misctechnical", "miscellaneoustechnical": return .miscellaneousTechnical + case "modi": return .modi + case "modifierletters", "spacingmodifierletters": return .spacingModifierLetters + case "modifiertoneletters": return .modifierToneLetters + case "mongolian": return .mongolian + case "mongoliansup", "mongoliansupplement": return .mongolianSupplement + case "mro": return .mro + case "multani": return .multani + case "music", "musicalsymbols": return .musicalSymbols + case "myanmar": return .myanmar + case "myanmarexta", "myanmarextendeda": return .myanmarExtendedA + case "myanmarextb", "myanmarextendedb": return .myanmarExtendedB + case "nabataean": return .nabataean + case "nandinagari": return .nandinagari + case "nb", "noblock": return .noBlock + case "newtailue": return .newTailue + case "newa": return .newa + case "nko": return .nko + case "numberforms": return .numberForms + case "nushu": return .nushu + case "nyiakengpuachuehmong": return .nyiakengPuachueHmong + case "ocr", "opticalcharacterrecognition": return .opticalCharacterRecognition + case "ogham": return .ogham + case "olchiki": return .olChiki + case "oldhungarian": return .oldHungarian + case "olditalic": return .oldItalic + case "oldnortharabian": return .oldNorthArabian + case "oldpermic": return .oldPermic + case "oldpersian": return .oldPersian + case "oldsogdian": return .oldSogdian + case "oldsoutharabian": return .oldSouthArabian + case "oldturkic": return .oldTurkic + case "olduyghur": return .oldUyghur + case "oriya": return .oriya + case "ornamentaldingbats": return .ornamentalDingbats + case "osage": return .osage + case "osmanya": return .osmanya + case "ottomansiyaqnumbers": return .ottomanSiyaqNumbers + case "pahawhhmong": return .pahawhHmong + case "palmyrene": return .palmyrene + case "paucinhau": return .pauCinHau + case "phagspa": return .phagsPA + case "phaistos", "phaistosdisc": return .phaistosDisc + case "phoenician": return .phoenician + case "phoneticext", "phoneticextensions": return .phoneticExtensions + case "phoneticextsup", "phoneticextensionssupplement": return .phoneticExtensionsSupplement + case "playingcards": return .playingCards + case "psalterpahlavi": return .psalterPahlavi + case "pua", "privateusearea", "privateuse": return .privateUseArea + case "punctuation", "generalpunctuation": return .generalPunctuation + case 
"rejang": return .rejang + case "rumi", "ruminumeralsymbols": return .rumiNumeralSymbols + case "runic": return .runic + case "samaritan": return .samaritan + case "saurashtra": return .saurashtra + case "sharada": return .sharada + case "shavian": return .shavian + case "shorthandformatcontrols": return .shorthandFormatControls + case "siddham": return .siddham + case "sinhala": return .sinhala + case "sinhalaarchaicnumbers": return .sinhalaArchaicNumbers + case "smallforms", "smallformvariants": return .smallFormVariants + case "smallkanaext", "smallkanaextension": return .smallKanaExtension + case "sogdian": return .sogdian + case "sorasompeng": return .soraSompeng + case "soyombo": return .soyombo + case "specials": return .specials + case "sundanese": return .sundanese + case "sundanesesup", "sundanesesupplement": return .sundaneseSupplement + case "suparrowsa", "supplementalarrowsa": return .supplementalArrowsA + case "suparrowsb", "supplementalarrowsb": return .supplementalArrowsB + case "suparrowsc", "supplementalarrowsc": return .supplementalArrowsC + case "supmathoperators", "supplementalmathematicaloperators": return .supplementalMathematicalOperators + case "suppuaa", "supplementaryprivateuseareaa": return .supplementaryPrivateUseAreaA + case "suppuab", "supplementaryprivateuseareab": return .supplementaryPrivateUseAreaB + case "suppunctuation", "supplementalpunctuation": return .supplementalPunctuation + case "supsymbolsandpictographs", "supplementalsymbolsandpictographs": return .supplementalSymbolsAndPictographs + case "superandsub", "superscriptsandsubscripts": return .superscriptsAndSubscripts + case "suttonsignwriting": return .suttonSignwriting + case "sylotinagri": return .sylotiNagri + case "symbolsandpictographsexta", "symbolsandpictographsextendeda": return .symbolsAndPictographsExtendedA + case "symbolsforlegacycomputing": return .symbolsForLegacyComputing + case "syriac": return .syriac + case "syriacsup", "syriacsupplement": return .syriacSupplement + case "tagalog": return .tagalog + case "tagbanwa": return .tagbanwa + case "tags": return .tags + case "taile": return .taiLe + case "taitham": return .taiTham + case "taiviet": return .taiViet + case "taixuanjing", "taixuanjingsymbols": return .taiXuanJingSymbols + case "takri": return .takri + case "tamil": return .tamil + case "tamilsup", "tamilsupplement": return .tamilSupplement + case "tangsa": return .tangsa + case "tangut": return .tangut + case "tangutcomponents": return .tangutComponents + case "tangutsup", "tangutsupplement": return .tangutSupplement + case "telugu": return .telugu + case "thaana": return .thaana + case "thai": return .thai + case "tibetan": return .tibetan + case "tifinagh": return .tifinagh + case "tirhuta": return .tirhuta + case "toto": return .toto + case "transportandmap", "transportandmapsymbols": return .transportAndMapSymbols + case "ucas", "unifiedcanadianaboriginalsyllabics", "canadiansyllabics": return .unifiedCanadianAboriginalSyllabics + case "ucasext", "unifiedcanadianaboriginalsyllabicsextended": return .unifiedCanadianAboriginalSyllabicsExtended + case "ucasexta", "unifiedcanadianaboriginalsyllabicsextendeda": return .unifiedCanadianAboriginalSyllabicsExtendedA + case "ugaritic": return .ugaritic + case "vai": return .vai + case "vedicext", "vedicextensions": return .vedicExtensions + case "verticalforms": return .verticalForms + case "vithkuqi": return .vithkuqi + case "vs", "variationselectors": return .variationSelectors + case "vssup", "variationselectorssupplement": 
return .variationSelectorsSupplement + case "wancho": return .wancho + case "warangciti": return .warangCiti + case "yezidi": return .yezidi + case "yiradicals": return .yiRadicals + case "yisyllables": return .yiSyllables + case "yijing", "yijinghexagramsymbols": return .yijingHexagramSymbols + case "zanabazarsquare": return .zanabazarSquare + case "znamennymusic", "znamennymusicalnotation": return .znamennyMusicalNotation + default: return nil + } + } + } + static func classifySpecialPropValue(_ value: String) -> PropertyKind? { withNormalizedForms(value) { str in switch str { @@ -361,6 +714,27 @@ extension Source { } } } + + static func parseAge(_ value: String) -> Unicode.Version? { + // Age can be specified in the form '3.0' or 'V3_0'. + // Other formats are not supported. + var str = value[...] + + let separator: Character + if str.first == "V" { + str.removeFirst() + separator = "_" + } else { + separator = "." + } + + guard let sepIndex = str.firstIndex(of: separator), + let major = Int(str[.. PropertyKind? in - switch key { + let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in + switch normalizedKey { case "script", "sc": - if let script = classifyScriptProperty(value) { - return .script(script) + guard let script = classifyScriptProperty(value) else { + throw ParseError.unrecognizedScript(value) } + return .script(script) case "scriptextensions", "scx": - if let script = classifyScriptProperty(value) { - return .scriptExtension(script) + guard let script = classifyScriptProperty(value) else { + throw ParseError.unrecognizedScript(value) } + return .scriptExtension(script) case "gc", "generalcategory": - if let cat = classifyGeneralCategory(value) { - return .generalCategory(cat) + guard let cat = classifyGeneralCategory(value) else { + throw ParseError.unrecognizedCategory(value) } + return .generalCategory(cat) + case "age": + guard let (major, minor) = parseAge(value) else { + throw ParseError.invalidAge(value) + } + return .age(major: major, minor: minor) case "name", "na": return .named(value) + case "numericvalue", "nv": + guard let numericValue = Double(value) else { + throw ParseError.invalidNumericValue(value) + } + return .numericValue(numericValue) + case "numerictype", "nt": + guard let type = classifyNumericType(value) else { + throw ParseError.unrecognizedNumericType(value) + } + return .numericType(type) + case "slc", "simplelowercasemapping": + return .mapping(.lowercase, value) + case "suc", "simpleuppercasemapping": + return .mapping(.uppercase, value) + case "stc", "simpletitlecasemapping": + return .mapping(.titlecase, value) + case "ccc", "canonicalcombiningclass": + guard let cccValue = UInt8(value), cccValue <= 254 else { + throw ParseError.invalidCCC(value) + } + return .ccc(.init(rawValue: cccValue)) + + case "blk", "block": + guard let block = classifyBlockProperty(value, valueOnly: false) else { + throw ParseError.unrecognizedBlock(value) + } + return .block(block) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d87fba918..d00862e9b 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -42,6 +42,7 @@ enum ParseError: Error, Hashable { case expectedNonEmptyContents case expectedEscape case invalidEscape(Character) + case confusableCharacter(Character) case cannotReferToWholePattern @@ -59,7 +60,14 @@ enum ParseError: Error, Hashable { case emptyProperty case 
unknownProperty(key: String?, value: String) - + case unrecognizedScript(String) + case unrecognizedCategory(String) + case unrecognizedBlock(String) + case invalidAge(String) + case invalidNumericValue(String) + case unrecognizedNumericType(String) + case invalidCCC(String) + case expectedGroupSpecifier case unbalancedEndOfGroup @@ -79,6 +87,7 @@ enum ParseError: Error, Hashable { case unsupported(String) case deprecatedUnicode(String) case invalidReference(Int) + case invalidNamedReference(String) case duplicateNamedCapture(String) case invalidCharacterClassRangeOperand case invalidQuantifierRange(Int, Int) @@ -128,6 +137,8 @@ extension ParseError: CustomStringConvertible { return "expected escape sequence" case .invalidEscape(let c): return "invalid escape sequence '\\\(c)'" + case .confusableCharacter(let c): + return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .quantifierRequiresOperand(let q): @@ -181,6 +192,20 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + case .unrecognizedScript(let value): + return "unrecognized script '\(value)'" + case .unrecognizedCategory(let value): + return "unrecognized category '\(value)'" + case .unrecognizedBlock(let value): + return "unrecognized block '\(value)'" + case .unrecognizedNumericType(let value): + return "unrecognized numeric type '\(value)'" + case .invalidAge(let value): + return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats" + case .invalidNumericValue(let value): + return "invalid numeric value '\(value)'" + case .invalidCCC(let value): + return "invalid canonical combining class '\(value)'" // MARK: Semantic Errors @@ -190,6 +215,8 @@ extension ParseError: CustomStringConvertible { return "\(kind) is a deprecated Unicode property, and is not supported" case let .invalidReference(i): return "no capture numbered \(i)" + case let .invalidNamedReference(name): + return "no capture named '\(name)'" case let .duplicateNamedCapture(str): return "group named '\(str)' already exists" case let .invalidQuantifierRange(lhs, rhs): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index e8783dc86..a6dfa0ce9 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -149,6 +149,14 @@ extension Source { return result } + /// Perform a lookahead using a temporary source. Within the body of the + /// lookahead, any modifications to the source will not be reflected outside + /// the body. + func lookahead(_ body: (inout Source) throws -> T) rethrows -> T { + var src = self + return try body(&src) + } + /// Attempt to eat the given character, returning its source location if /// successful, `nil` otherwise. mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? { @@ -413,9 +421,7 @@ extension Source { ) throws -> (Located, Located, [AST.Trivia])? { var trivia: [AST.Trivia] = [] - if let t = try lexNonSemanticWhitespace(context: context) { - trivia.append(t) - } + if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } let amt: Located? 
= try recordLoc { src in if src.tryEat("*") { return .zeroOrMore } @@ -424,7 +430,7 @@ extension Source { return try src.tryEating { src in guard src.tryEat("{"), - let range = try src.lexRange(context: context), + let range = try src.lexRange(context: context, trivia: &trivia), src.tryEat("}") else { return nil } return range.value @@ -433,9 +439,7 @@ extension Source { guard let amt = amt else { return nil } // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = try lexNonSemanticWhitespace(context: context) { - trivia.append(t) - } + if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } let kind: Located = recordLoc { src in if src.tryEat("?") { return .reluctant } @@ -452,11 +456,17 @@ extension Source { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange(context: ParsingContext) throws -> Located? { + mutating func lexRange( + context: ParsingContext, trivia: inout [AST.Trivia] + ) throws -> Located? { try recordLoc { src in try src.tryEating { src in + if let t = src.lexWhitespace() { trivia.append(t) } + let lowerOpt = try src.lexNumber() + if let t = src.lexWhitespace() { trivia.append(t) } + // ',' or '...' or '..<' or nothing // TODO: We ought to try and consume whitespace here and emit a // diagnostic for the user warning them that it would cause the range to @@ -476,11 +486,15 @@ extension Source { closedRange = nil } + if let t = src.lexWhitespace() { trivia.append(t) } + let upperOpt = try src.lexNumber()?.map { upper in // If we have an open range, the upper bound should be adjusted down. closedRange == true ? upper : upper - 1 } + if let t = src.lexWhitespace() { trivia.append(t) } + switch (lowerOpt, closedRange, upperOpt) { case let (l?, nil, nil): return .exactly(l) @@ -589,6 +603,26 @@ extension Source { return AST.Quote(str.value, str.location) } + /// Try to consume an interpolation sequence. + /// + /// Interpolation -> '<{' String '}>' + /// + mutating func lexInterpolation() throws -> AST.Interpolation? { + let contents = try recordLoc { src -> String? in + try src.tryEating { src in + guard src.tryEat(sequence: "<{") else { return nil } + _ = src.lexUntil { $0.isEmpty || $0.starts(with: "}>") } + guard src.tryEat(sequence: "}>") else { return nil } + + // Not currently supported. We error here instead of during Sema to + // get a better error for something like `(<{)}>`. + throw ParseError.unsupported("interpolation") + } + } + guard let contents = contents else { return nil } + return .init(contents.value, contents.location) + } + /// Try to consume a comment /// /// Comment -> '(?#' [^')']* ')' @@ -605,11 +639,11 @@ extension Source { /// mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { let trivia: Located? = try recordLoc { src in - if src.tryEat(sequence: "(?#") { - return try src.expectQuoted(endingWith: ")").value + if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") { + return try src.lexUntil(eating: ")").value } if context.experimentalComments, src.tryEat(sequence: "/*") { - return try src.expectQuoted(endingWith: "*/").value + return try src.lexUntil(eating: "*/").value } if context.endOfLineComments, src.tryEat("#") { // Try eat until we either exhaust the input, or hit a newline. Note @@ -647,7 +681,7 @@ extension Source { /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set mutating func lexNonSemanticWhitespace( context: ParsingContext - ) throws -> AST.Trivia? { + ) -> AST.Trivia? 
{ guard context.ignoreWhitespace else { return nil } // FIXME: PCRE only treats space and tab characters as whitespace when @@ -679,7 +713,7 @@ extension Source { if let comment = try lexComment(context: context) { return comment } - if let whitespace = try lexNonSemanticWhitespace(context: context) { + if let whitespace = lexNonSemanticWhitespace(context: context) { return whitespace } return nil @@ -1158,8 +1192,7 @@ extension Source { } } - mutating func lexCustomCCStart( - ) throws -> Located? { + mutating func lexCustomCCStart() -> Located? { recordLoc { src in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise @@ -1220,8 +1253,9 @@ extension Source { private func canLexPOSIXCharacterProperty() -> Bool { do { - var src = self - return try src.lexPOSIXCharacterProperty() != nil + return try lookahead { src in + try src.lexPOSIXCharacterProperty() != nil + } } catch { // We want to tend on the side of lexing a POSIX character property, so // even if it is invalid in some way (e.g invalid property names), still @@ -1374,10 +1408,11 @@ extension Source { /// Checks whether a numbered reference can be lexed. private func canLexNumberedReference() -> Bool { - var src = self - _ = src.tryEat(anyOf: "+", "-") - guard let next = src.peek() else { return false } - return RadixKind.decimal.characterFilter(next) + lookahead { src in + _ = src.tryEat(anyOf: "+", "-") + guard let next = src.peek() else { return false } + return RadixKind.decimal.characterFilter(next) + } } /// Eat a named reference up to a given closing delimiter. @@ -1567,53 +1602,55 @@ extension Source { /// Whether we can lex a group-like reference after the specifier '(?'. private func canLexGroupLikeReference() -> Bool { - var src = self - if src.tryEat("P") { - return src.tryEat(anyOf: "=", ">") != nil - } - if src.tryEat(anyOf: "&", "R") != nil { - return true + lookahead { src in + if src.tryEat("P") { + return src.tryEat(anyOf: "=", ">") != nil + } + if src.tryEat(anyOf: "&", "R") != nil { + return true + } + return src.canLexNumberedReference() } - return src.canLexNumberedReference() } private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { - var src = self - - // See if we can lex a matching option sequence that terminates in ')'. Such - // a sequence is an atom. If an error is thrown, there are invalid elements - // of the matching option sequence. In such a case, we can lex as a group - // and diagnose the invalid group kind. - guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { - return false + lookahead { src in + // See if we can lex a matching option sequence that terminates in ')'. + // Such a sequence is an atom. If an error is thrown, there are invalid + // elements of the matching option sequence. In such a case, we can lex as + // a group and diagnose the invalid group kind. + guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") } - return src.tryEat(")") } /// Whether a group specifier should be lexed as an atom instead of a group. private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { - var src = self - guard src.tryEat("(") else { return false } + lookahead { src in + guard src.tryEat("(") else { return false } + + if src.tryEat("?") { + // The start of a reference '(?P=', '(?R', ... 
+ if src.canLexGroupLikeReference() { return true } - if src.tryEat("?") { - // The start of a reference '(?P=', '(?R', ... - if src.canLexGroupLikeReference() { return true } + // The start of a PCRE callout. + if src.tryEat("C") { return true } - // The start of a PCRE callout. - if src.tryEat("C") { return true } + // The start of an Oniguruma 'of-contents' callout. + if src.tryEat("{") { return true } - // The start of an Oniguruma 'of-contents' callout. - if src.tryEat("{") { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } - // A matching option atom (?x), (?i), ... - if src.canLexMatchingOptionsAsAtom(context: context) { return true } + return false + } + // The start of a backreference directive or Oniguruma named callout. + if src.tryEat("*") { return true } return false } - // The start of a backreference directive or Oniguruma named callout. - if src.tryEat("*") { return true } - - return false } /// Consume an escaped atom, starting from after the backslash @@ -1674,9 +1711,10 @@ extension Source { break } - // We only allow unknown escape sequences for non-letter ASCII, and - // non-ASCII whitespace. - guard (char.isASCII && !char.isLetter) || + // We only allow unknown escape sequences for non-letter non-number ASCII, + // and non-ASCII whitespace. + // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`. + guard (char.isASCII && !char.isLetter && !char.isNumber) || (!char.isASCII && char.isWhitespace) else { throw ParseError.invalidEscape(char) @@ -1981,29 +2019,31 @@ extension Source { case "]": assert(!customCC, "parser should have prevented this") - fallthrough + break - default: return .char(char) + default: + // Reject non-letter non-number non-`\r\n` ASCII characters that have + // multiple scalars. These may be confusable for metacharacters, e.g + // `[\u{301}]` wouldn't be interpreted as a custom character class due + // to the combining accent (assuming it is literal, not `\u{...}`). + let scalars = char.unicodeScalars + if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" && + !char.isLetter && !char.isNumber { + throw ParseError.confusableCharacter(char) + } + break } + return .char(char) } guard let kind = kind else { return nil } return AST.Atom(kind.value, kind.location) } - /// Try to lex the end of a range in a custom character class, which consists - /// of a '-' character followed by an atom. - mutating func lexCustomCharClassRangeEnd( - context: ParsingContext - ) throws -> (dashLoc: SourceLocation, AST.Atom)? { - // Make sure we don't have a binary operator e.g '--', and the '-' is not - // ending the custom character class (in which case it is literal). - guard peekCCBinOp() == nil, !starts(with: "-]"), - let dash = tryEatWithLoc("-"), - let end = try lexAtom(context: context) - else { - return nil - } - return (dash, end) + /// Try to lex the range operator '-' for a custom character class. + mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? { + // Eat a '-', making sure we don't have a binary op such as '--'. + guard peekCCBinOp() == nil else { return nil } + return tryEatWithLoc("-") } /// Try to consume a newline sequence matching option kind. 
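The `lookahead(_:)` helper introduced in LexicalAnalysis.swift above replaces the repeated "copy the source, probe, then discard" pattern that the various `canLex…` methods previously spelled out by hand. Below is a minimal standalone sketch of that pattern under stated assumptions: `Cursor` and its members are hypothetical stand-ins for the parser's `Source`, and the generic `<T>` parameter (stripped by the line mangling above) is restored.

```swift
struct Cursor {
  var input: Substring

  mutating func tryEat(_ c: Character) -> Bool {
    guard input.first == c else { return false }
    input.removeFirst()
    return true
  }

  /// Run `body` on a copy of the cursor; anything the body consumes is
  /// discarded, so the caller's position is unchanged by the probe.
  func lookahead<T>(_ body: (inout Cursor) throws -> T) rethrows -> T {
    var probe = self
    return try body(&probe)
  }
}

let cursor = Cursor(input: "(?R)")
let startsGroupLikeAtom = cursor.lookahead { (probe: inout Cursor) -> Bool in
  guard probe.tryEat("(") else { return false }
  return probe.tryEat("?")
}
// startsGroupLikeAtom == true, and cursor.input is still "(?R)".
```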
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 112f32358..84957220c 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -222,6 +222,13 @@ extension Parser { result.append(.quote(quote)) continue } + + // Interpolation -> `lexInterpolation` + if let interpolation = try source.lexInterpolation() { + result.append(.interpolation(interpolation)) + continue + } + // Quantification -> QuantOperand Quantifier? if let operand = try parseQuantifierOperand() { if let (amt, kind, trivia) = @@ -422,7 +429,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = source.lexCustomCCStart() { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -465,16 +472,6 @@ extension Parser { var members: Array = [] try parseCCCMembers(into: &members) - // If we didn't parse any semantic members, we can eat a ']' character, as - // PCRE, Oniguruma, and ICU forbid empty character classes, and assume an - // initial ']' is literal. - if members.none(\.isSemantic) { - if let loc = source.tryEatWithLoc("]") { - members.append(.atom(.init(.char("]"), loc))) - try parseCCCMembers(into: &members) - } - } - // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. // TODO: We may want to diagnose and require users to disambiguate, at least @@ -488,16 +485,7 @@ extension Parser { throw Source.LocatedError( ParseError.expectedCustomCharacterClassMembers, start.location) } - - // If we're done, bail early - let setOp = Member.setOperation(members, binOp, rhs) - if source.tryEat("]") { - return CustomCC( - start, [setOp], loc(start.location.start)) - } - - // Otherwise it's just another member to accumulate - members = [setOp] + members = [.setOperation(members, binOp, rhs)] } if members.none(\.isSemantic) { throw Source.LocatedError( @@ -507,45 +495,72 @@ extension Parser { return CustomCC(start, members, loc(start.location.start)) } + mutating func parseCCCMember() throws -> CustomCC.Member? { + guard !source.isEmpty && source.peek() != "]" && source.peekCCBinOp() == nil + else { return nil } + + // Nested custom character class. + if let cccStart = source.lexCustomCCStart() { + return .custom(try parseCustomCharacterClass(cccStart)) + } + + // Quoted sequence. + if let quote = try source.lexQuote(context: context) { + return .quote(quote) + } + + // Lex triva if we're allowed. + if let trivia = try source.lexTrivia(context: context) { + return .trivia(trivia) + } + + if let atom = try source.lexAtom(context: context) { + return .atom(atom) + } + return nil + } + mutating func parseCCCMembers( into members: inout Array ) throws { // Parse members until we see the end of the custom char class or an // operator. - while !source.isEmpty && source.peek() != "]" && - source.peekCCBinOp() == nil { - - // Nested custom character class. - if let cccStart = try source.lexCustomCCStart() { - members.append(.custom(try parseCustomCharacterClass(cccStart))) - continue - } - - // Quoted sequence. - if let quote = try source.lexQuote(context: context) { - members.append(.quote(quote)) - continue - } - - // Lex non-semantic whitespace if we're allowed. - // TODO: ICU allows end-of-line comments in custom character classes, - // which we ought to support if we want to support multi-line regex. 
- if let trivia = try source.lexNonSemanticWhitespace(context: context) { - members.append(.trivia(trivia)) - continue - } + while let member = try parseCCCMember() { + members.append(member) + + // If we have an atom, we can try to parse a character class range. Each + // time we parse a component of the range, we append to `members` in case + // it ends up not being a range, and we bail. If we succeed in parsing, we + // remove the intermediate members. + if case .atom(let lhs) = member { + let membersBeforeRange = members.count - 1 + + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } - guard let atom = try source.lexAtom(context: context) else { break } + guard let dash = source.lexCustomCharacterClassRangeOperator() else { + continue + } + // If we can't parse a range, '-' becomes literal, e.g `[6-]`. + members.append(.atom(.init(.char("-"), dash))) - // Range between atoms. - if let (dashLoc, rhs) = - try source.lexCustomCharClassRangeEnd(context: context) { - members.append(.range(.init(atom, dashLoc, rhs))) - continue + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let rhs = try parseCCCMember() else { continue } + members.append(rhs) + + guard case let .atom(rhs) = rhs else { continue } + + // We've successfully parsed an atom LHS and RHS, so form a range, + // collecting the trivia we've parsed, and replacing the members that + // would have otherwise been added to the custom character class. + let rangeMemberCount = members.count - membersBeforeRange + let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) + members.removeLast(rangeMemberCount) + members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) } - - members.append(.atom(atom)) - continue } } } diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 9d5ae4576..83c014d2a 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -72,20 +72,20 @@ extension RegexValidator { } func validateReference(_ ref: AST.Reference) throws { + if let recLevel = ref.recursionLevel { + throw error(.unsupported("recursion level"), at: recLevel.location) + } switch ref.kind { case .absolute(let i): guard i <= captures.captures.count else { throw error(.invalidReference(i), at: ref.innerLoc) } + case .named(let name): + guard captures.hasCapture(named: name) else { + throw error(.invalidNamedReference(name), at: ref.innerLoc) + } case .relative: throw error(.unsupported("relative capture reference"), at: ref.innerLoc) - case .named: - // TODO: This could be implemented by querying the capture list for an - // index. 
- throw error(.unsupported("named capture reference"), at: ref.innerLoc) - } - if let recLevel = ref.recursionLevel { - throw error(.unsupported("recursion level"), at: recLevel.location) } } @@ -127,8 +127,8 @@ extension RegexValidator { _ prop: Unicode.BinaryProperty, at loc: SourceLocation ) throws { switch prop { - case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, - .changesWhenCasefolded, .changesWhenCasemapped, + case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased, + .caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped, .changesWhenNFKCCasefolded, .changesWhenLowercased, .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, .defaultIgnorableCodePoint, .diacratic, .extender, @@ -150,7 +150,7 @@ extension RegexValidator { case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) - case .bidiControl, .compositionExclusion, .emojiComponent, + case .compositionExclusion, .emojiComponent, .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, @@ -169,12 +169,14 @@ extension RegexValidator { case .binary(let b, _): try validateBinaryProperty(b, at: loc) case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, - .scriptExtension: + .scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc: break case .pcreSpecial: throw error(.unsupported("PCRE property"), at: loc) - case .onigurumaSpecial: + case .block: throw error(.unsupported("Unicode block property"), at: loc) + case .javaSpecial: + throw error(.unsupported("Java property"), at: loc) } } @@ -395,6 +397,11 @@ extension RegexValidator { // These are Oniguruma specific. throw error(.unsupported("absent function"), at: a.location) + case .interpolation(let i): + // This is currently rejected in the parser for better diagnostics, but + // reject here too until we get runtime support. + throw error(.unsupported("interpolation"), at: i.location) + case .quote, .trivia, .empty: break } diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index b8937d518..10e50d712 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -101,6 +101,10 @@ extension AST.Trivia { } } +extension AST.Interpolation { + public var _dumpBase: String { "interpolation <\(contents)>" } +} + extension AST.Empty { public var _dumpBase: String { "" } } @@ -305,7 +309,7 @@ extension AST.CustomCharacterClass: _ASTNode { // comparisons of dumped output in tests. // TODO: We should eventually have some way of filtering out trivia for // tests, so that it can appear in regular dumps. 
- return "customCharacterClass(\(strippingTriviaShallow.members))" + return "customCharacterClass(inverted: \(isInverted), \(strippingTriviaShallow.members))" } } diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 59c0cc04a..ac553a115 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -97,6 +97,9 @@ extension PrettyPrinter { case let .trivia(t): output(t._canonicalBase) + case let .interpolation(i): + output(i._canonicalBase) + case let .atom(a): output(a._canonicalBase) @@ -178,6 +181,12 @@ extension AST.Quote { } } +extension AST.Interpolation { + var _canonicalBase: String { + "<{\(contents)}>" + } +} + extension AST.Group.Kind { var _canonicalBase: String { switch self { diff --git a/Sources/_RegexParser/Utility/MissingUnicode.swift b/Sources/_RegexParser/Utility/MissingUnicode.swift index b1a4a07ff..c61c78c46 100644 --- a/Sources/_RegexParser/Utility/MissingUnicode.swift +++ b/Sources/_RegexParser/Utility/MissingUnicode.swift @@ -19,8 +19,7 @@ extension Unicode { // other script types. /// Character script types. - @frozen - public enum Script: String, Hashable { + public enum Script: String, Hashable, CaseIterable { case adlam = "Adlam" case ahom = "Ahom" case anatolianHieroglyphs = "Anatolian_Hieroglyphs" @@ -188,8 +187,7 @@ extension Unicode { /// POSIX character properties not already covered by general categories or /// binary properties. - @frozen - public enum POSIXProperty: String, Hashable { + public enum POSIXProperty: String, Hashable, CaseIterable { case alnum = "alnum" case blank = "blank" case graph = "graph" @@ -206,8 +204,7 @@ extension Unicode { /// Unicode.GeneralCategory + cases for "meta categories" such as "L", which /// encompasses Lu | Ll | Lt | Lm | Lo. - @frozen - public enum ExtendedGeneralCategory: String, Hashable { + public enum ExtendedGeneralCategory: String, Hashable, CaseIterable { case other = "C" case control = "Cc" case format = "Cf" @@ -257,8 +254,7 @@ extension Unicode { /// A list of Unicode properties that can either be true or false. /// /// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt - @frozen - public enum BinaryProperty: String, Hashable { + public enum BinaryProperty: String, Hashable, CaseIterable { case asciiHexDigit = "ASCII_Hex_Digit" case alphabetic = "Alphabetic" case bidiControl = "Bidi_Control" @@ -327,335 +323,653 @@ extension Unicode { case expandsOnNFKC = "Expands_On_NFKC" case expandsOnNFKD = "Expands_On_NFKD" } -} - -// TODO: These should become aliases for the Block (blk) Unicode character -// property. -/// Oniguruma properties that are not covered by Unicode spellings. 
-@frozen -public enum OnigurumaSpecialProperty: String, Hashable { - case inBasicLatin = "In_Basic_Latin" - case inLatin1Supplement = "In_Latin_1_Supplement" - case inLatinExtendedA = "In_Latin_Extended_A" - case inLatinExtendedB = "In_Latin_Extended_B" - case inIPAExtensions = "In_IPA_Extensions" - case inSpacingModifierLetters = "In_Spacing_Modifier_Letters" - case inCombiningDiacriticalMarks = "In_Combining_Diacritical_Marks" - case inGreekAndCoptic = "In_Greek_and_Coptic" - case inCyrillic = "In_Cyrillic" - case inCyrillicSupplement = "In_Cyrillic_Supplement" - case inArmenian = "In_Armenian" - case inHebrew = "In_Hebrew" - case inArabic = "In_Arabic" - case inSyriac = "In_Syriac" - case inArabicSupplement = "In_Arabic_Supplement" - case inThaana = "In_Thaana" - case inNKo = "In_NKo" - case inSamaritan = "In_Samaritan" - case inMandaic = "In_Mandaic" - case inSyriacSupplement = "In_Syriac_Supplement" - case inArabicExtendedB = "In_Arabic_Extended_B" - case inArabicExtendedA = "In_Arabic_Extended_A" - case inDevanagari = "In_Devanagari" - case inBengali = "In_Bengali" - case inGurmukhi = "In_Gurmukhi" - case inGujarati = "In_Gujarati" - case inOriya = "In_Oriya" - case inTamil = "In_Tamil" - case inTelugu = "In_Telugu" - case inKannada = "In_Kannada" - case inMalayalam = "In_Malayalam" - case inSinhala = "In_Sinhala" - case inThai = "In_Thai" - case inLao = "In_Lao" - case inTibetan = "In_Tibetan" - case inMyanmar = "In_Myanmar" - case inGeorgian = "In_Georgian" - case inHangulJamo = "In_Hangul_Jamo" - case inEthiopic = "In_Ethiopic" - case inEthiopicSupplement = "In_Ethiopic_Supplement" - case inCherokee = "In_Cherokee" - case inUnifiedCanadianAboriginalSyllabics = "In_Unified_Canadian_Aboriginal_Syllabics" - case inOgham = "In_Ogham" - case inRunic = "In_Runic" - case inTagalog = "In_Tagalog" - case inHanunoo = "In_Hanunoo" - case inBuhid = "In_Buhid" - case inTagbanwa = "In_Tagbanwa" - case inKhmer = "In_Khmer" - case inMongolian = "In_Mongolian" - case inUnifiedCanadianAboriginalSyllabicsExtended = "In_Unified_Canadian_Aboriginal_Syllabics_Extended" - case inLimbu = "In_Limbu" - case inTaiLe = "In_Tai_Le" - case inNewTaiLue = "In_New_Tai_Lue" - case inKhmerSymbols = "In_Khmer_Symbols" - case inBuginese = "In_Buginese" - case inTaiTham = "In_Tai_Tham" - case inCombiningDiacriticalMarksExtended = "In_Combining_Diacritical_Marks_Extended" - case inBalinese = "In_Balinese" - case inSundanese = "In_Sundanese" - case inBatak = "In_Batak" - case inLepcha = "In_Lepcha" - case inOlChiki = "In_Ol_Chiki" - case inCyrillicExtendedC = "In_Cyrillic_Extended_C" - case inGeorgianExtended = "In_Georgian_Extended" - case inSundaneseSupplement = "In_Sundanese_Supplement" - case inVedicExtensions = "In_Vedic_Extensions" - case inPhoneticExtensions = "In_Phonetic_Extensions" - case inPhoneticExtensions_Supplement = "In_Phonetic_Extensions_Supplement" - case inCombiningDiacriticalMarksSupplement = "In_Combining_Diacritical_Marks_Supplement" - case inLatinExtendedAdditional = "In_Latin_Extended_Additional" - case inGreekExtended = "In_Greek_Extended" - case inGeneralPunctuation = "In_General_Punctuation" - case inSuperscriptsandSubscripts = "In_Superscripts_and_Subscripts" - case inCurrencySymbols = "In_Currency_Symbols" - case inCombiningDiacriticalMarksforSymbols = "In_Combining_Diacritical_Marks_for_Symbols" - case inLetterlikeSymbols = "In_Letterlike_Symbols" - case inNumberForms = "In_Number_Forms" - case inArrows = "In_Arrows" - case inMathematicalOperators = "In_Mathematical_Operators" - case 
inMiscellaneousTechnical = "In_Miscellaneous_Technical" - case inControlPictures = "In_Control_Pictures" - case inOpticalCharacterRecognition = "In_Optical_Character_Recognition" - case inEnclosedAlphanumerics = "In_Enclosed_Alphanumerics" - case inBoxDrawing = "In_Box_Drawing" - case inBlockElements = "In_Block_Elements" - case inGeometricShapes = "In_Geometric_Shapes" - case inMiscellaneousSymbols = "In_Miscellaneous_Symbols" - case inDingbats = "In_Dingbats" - case inMiscellaneousMathematicalSymbolsA = "In_Miscellaneous_Mathematical_Symbols_A" - case inSupplementalArrowsA = "In_Supplemental_Arrows_A" - case inBraillePatterns = "In_Braille_Patterns" - case inSupplementalArrowsB = "In_Supplemental_Arrows_B" - case inMiscellaneousMathematicalSymbolsB = "In_Miscellaneous_Mathematical_Symbols_B" - case inSupplementalMathematicalOperators = "In_Supplemental_Mathematical_Operators" - case inMiscellaneousSymbolsAndArrows = "In_Miscellaneous_Symbols_and_Arrows" - case inGlagolitic = "In_Glagolitic" - case inLatinExtendedC = "In_Latin_Extended_C" - case inCoptic = "In_Coptic" - case inGeorgianSupplement = "In_Georgian_Supplement" - case inTifinagh = "In_Tifinagh" - case inEthiopicExtended = "In_Ethiopic_Extended" - case inCyrillicExtendedA = "In_Cyrillic_Extended_A" - case inSupplementalPunctuation = "In_Supplemental_Punctuation" - case inCJKRadicalsSupplement = "In_CJK_Radicals_Supplement" - case inKangxiRadicals = "In_Kangxi_Radicals" - case inIdeographicDescriptionCharacters = "In_Ideographic_Description_Characters" - case inCJKSymbolsAndPunctuation = "In_CJK_Symbols_and_Punctuation" - case inHiragana = "In_Hiragana" - case inKatakana = "In_Katakana" - case inBopomofo = "In_Bopomofo" - case inHangulCompatibilityJamo = "In_Hangul_Compatibility_Jamo" - case inKanbun = "In_Kanbun" - case inBopomofoExtended = "In_Bopomofo_Extended" - case inCJKStrokes = "In_CJK_Strokes" - case inKatakanaPhoneticExtensions = "In_Katakana_Phonetic_Extensions" - case inEnclosedCJKLettersAndMonths = "In_Enclosed_CJK_Letters_and_Months" - case inCJKCompatibility = "In_CJK_Compatibility" - case inCJKUnifiedIdeographsExtensionA = "In_CJK_Unified_Ideographs_Extension_A" - case inYijingHexagramSymbols = "In_Yijing_Hexagram_Symbols" - case inCJKUnifiedIdeographs = "In_CJK_Unified_Ideographs" - case inYiSyllables = "In_Yi_Syllables" - case inYiRadicals = "In_Yi_Radicals" - case inLisu = "In_Lisu" - case inVai = "In_Vai" - case inCyrillicExtendedB = "In_Cyrillic_Extended_B" - case inBamum = "In_Bamum" - case inModifierToneLetters = "In_Modifier_Tone_Letters" - case inLatinExtendedD = "In_Latin_Extended_D" - case inSylotiNagri = "In_Syloti_Nagri" - case inCommonIndicNumberForms = "In_Common_Indic_Number_Forms" - case inPhagsPA = "In_Phags_pa" - case inSaurashtra = "In_Saurashtra" - case inDevanagariExtended = "In_Devanagari_Extended" - case inKayahLi = "In_Kayah_Li" - case inRejang = "In_Rejang" - case inHangulJamoExtendedA = "In_Hangul_Jamo_Extended_A" - case inJavanese = "In_Javanese" - case inMyanmarExtendedB = "In_Myanmar_Extended_B" - case inCham = "In_Cham" - case inMyanmarExtendedA = "In_Myanmar_Extended_A" - case inTaiViet = "In_Tai_Viet" - case inMeeteiMayekExtensions = "In_Meetei_Mayek_Extensions" - case inEthiopicExtendedA = "In_Ethiopic_Extended_A" - case inLatinExtendedE = "In_Latin_Extended_E" - case inCherokeeSupplement = "In_Cherokee_Supplement" - case inMeeteiMayek = "In_Meetei_Mayek" - case inHangulSyllables = "In_Hangul_Syllables" - case inHangulJamoExtendedB = "In_Hangul_Jamo_Extended_B" - case 
inHighSurrogates = "In_High_Surrogates" - case inHighPrivateUseSurrogates = "In_High_Private_Use_Surrogates" - case inLowSurrogates = "In_Low_Surrogates" - case inPrivateUseArea = "In_Private_Use_Area" - case inCJKCompatibilityIdeographs = "In_CJK_Compatibility_Ideographs" - case inAlphabeticPresentationForms = "In_Alphabetic_Presentation_Forms" - case inArabicPresentationFormsA = "In_Arabic_Presentation_Forms_A" - case inVariationSelectors = "In_Variation_Selectors" - case inVerticalForms = "In_Vertical_Forms" - case inCombiningHalfMarks = "In_Combining_Half_Marks" - case inCJKCompatibilityForms = "In_CJK_Compatibility_Forms" - case inSmallFormVariants = "In_Small_Form_Variants" - case inArabicPresentationFormsB = "In_Arabic_Presentation_Forms_B" - case inHalfwidthAndFullwidthForms = "In_Halfwidth_and_Fullwidth_Forms" - case inSpecials = "In_Specials" - case inLinearBSyllabary = "In_Linear_B_Syllabary" - case inLinearBIdeograms = "In_Linear_B_Ideograms" - case inAegeanNumbers = "In_Aegean_Numbers" - case inAncientGreekNumbers = "In_Ancient_Greek_Numbers" - case inAncientSymbols = "In_Ancient_Symbols" - case inPhaistosDisc = "In_Phaistos_Disc" - case inLycian = "In_Lycian" - case inCarian = "In_Carian" - case inCopticEpactNumbers = "In_Coptic_Epact_Numbers" - case inOldItalic = "In_Old_Italic" - case inGothic = "In_Gothic" - case inOldPermic = "In_Old_Permic" - case inUgaritic = "In_Ugaritic" - case inOldPersian = "In_Old_Persian" - case inDeseret = "In_Deseret" - case inShavian = "In_Shavian" - case inOsmanya = "In_Osmanya" - case inOsage = "In_Osage" - case inElbasan = "In_Elbasan" - case inCaucasianAlbanian = "In_Caucasian_Albanian" - case inVithkuqi = "In_Vithkuqi" - case inLinearA = "In_Linear_A" - case inLatinExtendedF = "In_Latin_Extended_F" - case inCypriotSyllabary = "In_Cypriot_Syllabary" - case inImperialAramaic = "In_Imperial_Aramaic" - case inPalmyrene = "In_Palmyrene" - case inNabataean = "In_Nabataean" - case inHatran = "In_Hatran" - case inPhoenician = "In_Phoenician" - case inLydian = "In_Lydian" - case inMeroiticHieroglyphs = "In_Meroitic_Hieroglyphs" - case inMeroiticCursive = "In_Meroitic_Cursive" - case inKharoshthi = "In_Kharoshthi" - case inOldSouthArabian = "In_Old_South_Arabian" - case inOldNorthArabian = "In_Old_North_Arabian" - case inManichaean = "In_Manichaean" - case inAvestan = "In_Avestan" - case inInscriptionalParthian = "In_Inscriptional_Parthian" - case inInscriptionalPahlavi = "In_Inscriptional_Pahlavi" - case inPsalterPahlavi = "In_Psalter_Pahlavi" - case inOldTurkic = "In_Old_Turkic" - case inOldHungarian = "In_Old_Hungarian" - case inHanifiRohingya = "In_Hanifi_Rohingya" - case inRumiNumeralSymbols = "In_Rumi_Numeral_Symbols" - case inYezidi = "In_Yezidi" - case inOldSogdian = "In_Old_Sogdian" - case inSogdian = "In_Sogdian" - case inOldUyghur = "In_Old_Uyghur" - case inChorasmian = "In_Chorasmian" - case inElymaic = "In_Elymaic" - case inBrahmi = "In_Brahmi" - case inKaithi = "In_Kaithi" - case inSoraSompeng = "In_Sora_Sompeng" - case inChakma = "In_Chakma" - case inMahajani = "In_Mahajani" - case inSharada = "In_Sharada" - case inSinhalaArchaicNumbers = "In_Sinhala_Archaic_Numbers" - case inKhojki = "In_Khojki" - case inMultani = "In_Multani" - case inKhudawadi = "In_Khudawadi" - case inGrantha = "In_Grantha" - case inNewa = "In_Newa" - case inTirhuta = "In_Tirhuta" - case inSiddham = "In_Siddham" - case inModi = "In_Modi" - case inMongolianSupplement = "In_Mongolian_Supplement" - case inTakri = "In_Takri" - case inAhom = "In_Ahom" - case inDogra = 
"In_Dogra" - case inWarangCiti = "In_Warang_Citi" - case inDivesAkuru = "In_Dives_Akuru" - case inNandinagari = "In_Nandinagari" - case inZanabazarSquare = "In_Zanabazar_Square" - case inSoyombo = "In_Soyombo" - case inUnifiedCanadianAboriginalSyllabicsExtendedA = "In_Unified_Canadian_Aboriginal_Syllabics_Extended_A" - case inPauCinHau = "In_Pau_Cin_Hau" - case inBhaiksuki = "In_Bhaiksuki" - case inMarchen = "In_Marchen" - case inMasaramGondi = "In_Masaram_Gondi" - case inGunjalaGondi = "In_Gunjala_Gondi" - case inMakasar = "In_Makasar" - case inLisuSupplement = "In_Lisu_Supplement" - case inTamilSupplement = "In_Tamil_Supplement" - case inCuneiform = "In_Cuneiform" - case inCuneiformNumbersandPunctuation = "In_Cuneiform_Numbers_and_Punctuation" - case inEarlyDynasticCuneiform = "In_Early_Dynastic_Cuneiform" - case inCyproMinoan = "In_Cypro_Minoan" - case inEgyptianHieroglyphs = "In_Egyptian_Hieroglyphs" - case inEgyptianHieroglyphFormatControls = "In_Egyptian_Hieroglyph_Format_Controls" - case inAnatolianHieroglyphs = "In_Anatolian_Hieroglyphs" - case inBamumSupplement = "In_Bamum_Supplement" - case inMro = "In_Mro" - case inTangsa = "In_Tangsa" - case inBassaVah = "In_Bassa_Vah" - case inPahawhHmong = "In_Pahawh_Hmong" - case inMedefaidrin = "In_Medefaidrin" - case inMiao = "In_Miao" - case inIdeographicSymbolsAndPunctuation = "In_Ideographic_Symbols_and_Punctuation" - case inTangut = "In_Tangut" - case inTangutComponents = "In_Tangut_Components" - case inKhitanSmallScript = "In_Khitan_Small_Script" - case inTangutSupplement = "In_Tangut_Supplement" - case inKanaExtendedB = "In_Kana_Extended_B" - case inKanaSupplement = "In_Kana_Supplement" - case inKanaExtendedA = "In_Kana_Extended_A" - case inSmallKanaExtension = "In_Small_Kana_Extension" - case inNushu = "In_Nushu" - case inDuployan = "In_Duployan" - case inShorthandFormatControls = "In_Shorthand_Format_Controls" - case inZnamennyMusicalNotation = "In_Znamenny_Musical_Notation" - case inByzantineMusicalSymbols = "In_Byzantine_Musical_Symbols" - case inMusicalSymbols = "In_Musical_Symbols" - case inAncientGreekMusicalNotation = "In_Ancient_Greek_Musical_Notation" - case inMayanNumerals = "In_Mayan_Numerals" - case inTaiXuanJingSymbols = "In_Tai_Xuan_Jing_Symbols" - case inCountingRodNumerals = "In_Counting_Rod_Numerals" - case inMathematicalAlphanumericSymbols = "In_Mathematical_Alphanumeric_Symbols" - case inSuttonSignWriting = "In_Sutton_SignWriting" - case inLatinExtendedG = "In_Latin_Extended_G" - case inGlagoliticSupplement = "In_Glagolitic_Supplement" - case inNyiakengPuachueHmong = "In_Nyiakeng_Puachue_Hmong" - case inToto = "In_Toto" - case inWancho = "In_Wancho" - case inEthiopicExtendedB = "In_Ethiopic_Extended_B" - case inMendeKikakui = "In_Mende_Kikakui" - case inAdlam = "In_Adlam" - case inIndicSiyaqNumbers = "In_Indic_Siyaq_Numbers" - case inOttomanSiyaqNumbers = "In_Ottoman_Siyaq_Numbers" - case inArabicMathematicalAlphabeticSymbols = "In_Arabic_Mathematical_Alphabetic_Symbols" - case inMahjongTiles = "In_Mahjong_Tiles" - case inDominoTiles = "In_Domino_Tiles" - case inPlayingCards = "In_Playing_Cards" - case inEnclosedAlphanumericSupplement = "In_Enclosed_Alphanumeric_Supplement" - case inEnclosedIdeographicSupplement = "In_Enclosed_Ideographic_Supplement" - case inMiscellaneousSymbolsandPictographs = "In_Miscellaneous_Symbols_and_Pictographs" - case inEmoticons = "In_Emoticons" - case inOrnamentalDingbats = "In_Ornamental_Dingbats" - case inTransportandMapSymbols = "In_Transport_and_Map_Symbols" - case 
inAlchemicalSymbols = "In_Alchemical_Symbols" - case inGeometricShapesExtended = "In_Geometric_Shapes_Extended" - case inSupplementalArrowsC = "In_Supplemental_Arrows_C" - case inSupplementalSymbolsAndPictographs = "In_Supplemental_Symbols_and_Pictographs" - case inChessSymbols = "In_Chess_Symbols" - case inSymbolsAndPictographsExtendedA = "In_Symbols_and_Pictographs_Extended_A" - case inSymbolsForLegacyComputing = "In_Symbols_for_Legacy_Computing" - case inCJKUnifiedIdeographsExtensionB = "In_CJK_Unified_Ideographs_Extension_B" - case inCJKUnifiedIdeographsExtensionC = "In_CJK_Unified_Ideographs_Extension_C" - case inCJKUnifiedIdeographsExtensionD = "In_CJK_Unified_Ideographs_Extension_D" - case inCJKUnifiedIdeographsExtensionE = "In_CJK_Unified_Ideographs_Extension_E" - case inCJKUnifiedIdeographsExtensionF = "In_CJK_Unified_Ideographs_Extension_F" - case inCJKCompatibilityIdeographsSupplement = "In_CJK_Compatibility_Ideographs_Supplement" - case inCJKUnifiedIdeographsExtensionG = "In_CJK_Unified_Ideographs_Extension_G" - case inTags = "In_Tags" - case inVariationSelectorsSupplement = "In_Variation_Selectors_Supplement" - case inSupplementaryPrivateUseAreaA = "In_Supplementary_Private_Use_Area_A" - case inSupplementaryPrivateUseAreaB = "In_Supplementary_Private_Use_Area_B" - case inNoBlock = "In_No_Block" + /// A list of unicode character blocks, including `No_Block`. + /// https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt + public enum Block: String, Hashable, CaseIterable { + /// 0000..007F; Basic Latin + case basicLatin = "Basic_Latin" + /// 0080..00FF; Latin-1 Supplement + case latin1Supplement = "Latin_1_Supplement" + /// 0100..017F; Latin Extended-A + case latinExtendedA = "Latin_Extended_A" + /// 0180..024F; Latin Extended-B + case latinExtendedB = "Latin_Extended_B" + /// 0250..02AF; IPA Extensions + case ipaExtensions = "IPA_Extensions" + /// 02B0..02FF; Spacing Modifier Letters + case spacingModifierLetters = "Spacing_Modifier_Letters" + /// 0300..036F; Combining Diacritical Marks + case combiningDiacriticalMarks = "Combining_Diacritical_Marks" + /// 0370..03FF; Greek and Coptic + case greekAndCoptic = "Greek_and_Coptic" + /// 0400..04FF; Cyrillic + case cyrillic = "Cyrillic" + /// 0500..052F; Cyrillic Supplement + case cyrillicSupplement = "Cyrillic_Supplement" + /// 0530..058F; Armenian + case armenian = "Armenian" + /// 0590..05FF; Hebrew + case hebrew = "Hebrew" + /// 0600..06FF; Arabic + case arabic = "Arabic" + /// 0700..074F; Syriac + case syriac = "Syriac" + /// 0750..077F; Arabic Supplement + case arabicSupplement = "Arabic_Supplement" + /// 0780..07BF; Thaana + case thaana = "Thaana" + /// 07C0..07FF; NKo + case nko = "NKo" + /// 0800..083F; Samaritan + case samaritan = "Samaritan" + /// 0840..085F; Mandaic + case mandaic = "Mandaic" + /// 0860..086F; Syriac Supplement + case syriacSupplement = "Syriac_Supplement" + /// 0870..089F; Arabic Extended-B + case arabicExtendedB = "Arabic_Extended_B" + /// 08A0..08FF; Arabic Extended-A + case arabicExtendedA = "Arabic_Extended_A" + /// 0900..097F; Devanagari + case devanagari = "Devanagari" + /// 0980..09FF; Bengali + case bengali = "Bengali" + /// 0A00..0A7F; Gurmukhi + case gurmukhi = "Gurmukhi" + /// 0A80..0AFF; Gujarati + case gujarati = "Gujarati" + /// 0B00..0B7F; Oriya + case oriya = "Oriya" + /// 0B80..0BFF; Tamil + case tamil = "Tamil" + /// 0C00..0C7F; Telugu + case telugu = "Telugu" + /// 0C80..0CFF; Kannada + case kannada = "Kannada" + /// 0D00..0D7F; Malayalam + case malayalam = "Malayalam" + /// 
0D80..0DFF; Sinhala + case sinhala = "Sinhala" + /// 0E00..0E7F; Thai + case thai = "Thai" + /// 0E80..0EFF; Lao + case lao = "Lao" + /// 0F00..0FFF; Tibetan + case tibetan = "Tibetan" + /// 1000..109F; Myanmar + case myanmar = "Myanmar" + /// 10A0..10FF; Georgian + case georgian = "Georgian" + /// 1100..11FF; Hangul Jamo + case hangulJamo = "Hangul_Jamo" + /// 1200..137F; Ethiopic + case ethiopic = "Ethiopic" + /// 1380..139F; Ethiopic Supplement + case ethiopicSupplement = "Ethiopic_Supplement" + /// 13A0..13FF; Cherokee + case cherokee = "Cherokee" + /// 1400..167F; Unified Canadian Aboriginal Syllabics + case unifiedCanadianAboriginalSyllabics = "Unified_Canadian_Aboriginal_Syllabics" + /// 1680..169F; Ogham + case ogham = "Ogham" + /// 16A0..16FF; Runic + case runic = "Runic" + /// 1700..171F; Tagalog + case tagalog = "Tagalog" + /// 1720..173F; Hanunoo + case hanunoo = "Hanunoo" + /// 1740..175F; Buhid + case buhid = "Buhid" + /// 1760..177F; Tagbanwa + case tagbanwa = "Tagbanwa" + /// 1780..17FF; Khmer + case khmer = "Khmer" + /// 1800..18AF; Mongolian + case mongolian = "Mongolian" + /// 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended + case unifiedCanadianAboriginalSyllabicsExtended = "Unified_Canadian_Aboriginal_Syllabics_Extended" + /// 1900..194F; Limbu + case limbu = "Limbu" + /// 1950..197F; Tai Le + case taiLe = "Tai_Le" + /// 1980..19DF; New Tai Lue + case newTailue = "New_Tai_Lue" + /// 19E0..19FF; Khmer Symbols + case khmerSymbols = "Khmer_Symbols" + /// 1A00..1A1F; Buginese + case buginese = "Buginese" + /// 1A20..1AAF; Tai Tham + case taiTham = "Tai_Tham" + /// 1AB0..1AFF; Combining Diacritical Marks Extended + case combiningDiacriticalMarksExtended = "Combining_Diacritical_Marks_Extended" + /// 1B00..1B7F; Balinese + case balinese = "Balinese" + /// 1B80..1BBF; Sundanese + case sundanese = "Sundanese" + /// 1BC0..1BFF; Batak + case batak = "Batak" + /// 1C00..1C4F; Lepcha + case lepcha = "Lepcha" + /// 1C50..1C7F; Ol Chiki + case olChiki = "Ol_Chiki" + /// 1C80..1C8F; Cyrillic Extended-C + case cyrillicExtendedC = "Cyrillic_Extended_C" + /// 1C90..1CBF; Georgian Extended + case georgianExtended = "Georgian_Extended" + /// 1CC0..1CCF; Sundanese Supplement + case sundaneseSupplement = "Sundanese_Supplement" + /// 1CD0..1CFF; Vedic Extensions + case vedicExtensions = "Vedic_Extensions" + /// 1D00..1D7F; Phonetic Extensions + case phoneticExtensions = "Phonetic_Extensions" + /// 1D80..1DBF; Phonetic Extensions Supplement + case phoneticExtensionsSupplement = "Phonetic_Extensions_Supplement" + /// 1DC0..1DFF; Combining Diacritical Marks Supplement + case combiningDiacriticalMarksSupplement = "Combining_Diacritical_Marks_Supplement" + /// 1E00..1EFF; Latin Extended Additional + case latinExtendedAdditional = "Latin_Extended_Additional" + /// 1F00..1FFF; Greek Extended + case greekExtended = "Greek_Extended" + /// 2000..206F; General Punctuation + case generalPunctuation = "General_Punctuation" + /// 2070..209F; Superscripts and Subscripts + case superscriptsAndSubscripts = "Superscripts_and_Subscripts" + /// 20A0..20CF; Currency Symbols + case currencySymbols = "Currency_Symbols" + /// 20D0..20FF; Combining Diacritical Marks for Symbols + case combiningDiacriticalMarksForSymbols = "Combining_Diacritical_Marks_for_Symbols" + /// 2100..214F; Letterlike Symbols + case letterLikeSymbols = "Letterlike_Symbols" + /// 2150..218F; Number Forms + case numberForms = "Number_Forms" + /// 2190..21FF; Arrows + case arrows = "Arrows" + /// 2200..22FF; Mathematical Operators + 
case mathematicalOperators = "Mathematical_Operators" + /// 2300..23FF; Miscellaneous Technical + case miscellaneousTechnical = "Miscellaneous_Technical" + /// 2400..243F; Control Pictures + case controlPictures = "Control_Pictures" + /// 2440..245F; Optical Character Recognition + case opticalCharacterRecognition = "Optical_Character_Recognition" + /// 2460..24FF; Enclosed Alphanumerics + case enclosedAlphanumerics = "Enclosed_Alphanumerics" + /// 2500..257F; Box Drawing + case boxDrawing = "Box_Drawing" + /// 2580..259F; Block Elements + case blockElements = "Block_Elements" + /// 25A0..25FF; Geometric Shapes + case geometricShapes = "Geometric_Shapes" + /// 2600..26FF; Miscellaneous Symbols + case miscellaneousSymbols = "Miscellaneous_Symbols" + /// 2700..27BF; Dingbats + case dingbats = "Dingbats" + /// 27C0..27EF; Miscellaneous Mathematical Symbols-A + case miscellaneousMathematicalSymbolsA = "Miscellaneous_Mathematical_Symbols_A" + /// 27F0..27FF; Supplemental Arrows-A + case supplementalArrowsA = "Supplemental_Arrows_A" + /// 2800..28FF; Braille Patterns + case braillePatterns = "Braille_Patterns" + /// 2900..297F; Supplemental Arrows-B + case supplementalArrowsB = "Supplemental_Arrows_B" + /// 2980..29FF; Miscellaneous Mathematical Symbols-B + case miscellaneousMathematicalSymbolsB = "Miscellaneous_Mathematical_Symbols_B" + /// 2A00..2AFF; Supplemental Mathematical Operators + case supplementalMathematicalOperators = "Supplemental_Mathematical_Operators" + /// 2B00..2BFF; Miscellaneous Symbols and Arrows + case miscellaneousSymbolsAndArrows = "Miscellaneous_Symbols_and_Arrows" + /// 2C00..2C5F; Glagolitic + case glagolitic = "Glagolitic" + /// 2C60..2C7F; Latin Extended-C + case latinExtendedC = "Latin_Extended_C" + /// 2C80..2CFF; Coptic + case coptic = "Coptic" + /// 2D00..2D2F; Georgian Supplement + case georgianSupplement = "Georgian_Supplement" + /// 2D30..2D7F; Tifinagh + case tifinagh = "Tifinagh" + /// 2D80..2DDF; Ethiopic Extended + case ethiopicExtended = "Ethiopic_Extended" + /// 2DE0..2DFF; Cyrillic Extended-A + case cyrillicExtendedA = "Cyrillic_Extended_A" + /// 2E00..2E7F; Supplemental Punctuation + case supplementalPunctuation = "Supplemental_Punctuation" + /// 2E80..2EFF; CJK Radicals Supplement + case cjkRadicalsSupplement = "CJK_Radicals_Supplement" + /// 2F00..2FDF; Kangxi Radicals + case kangxiRadicals = "Kangxi_Radicals" + /// 2FF0..2FFF; Ideographic Description Characters + case ideographicDescriptionCharacters = "Ideographic_Description_Characters" + /// 3000..303F; CJK Symbols and Punctuation + case cjkSymbolsAndPunctuation = "CJK_Symbols_and_Punctuation" + /// 3040..309F; Hiragana + case hiragana = "Hiragana" + /// 30A0..30FF; Katakana + case katakana = "Katakana" + /// 3100..312F; Bopomofo + case bopomofo = "Bopomofo" + /// 3130..318F; Hangul Compatibility Jamo + case hangulCompatibilityJamo = "Hangul_Compatibility_Jamo" + /// 3190..319F; Kanbun + case kanbun = "Kanbun" + /// 31A0..31BF; Bopomofo Extended + case bopomofoExtended = "Bopomofo_Extended" + /// 31C0..31EF; CJK Strokes + case cjkStrokes = "CJK_Strokes" + /// 31F0..31FF; Katakana Phonetic Extensions + case katakanaPhoneticExtensions = "Katakana_Phonetic_Extensions" + /// 3200..32FF; Enclosed CJK Letters and Months + case enclosedCJKLettersAndMonths = "Enclosed_CJK_Letters_and_Months" + /// 3300..33FF; CJK Compatibility + case cjkCompatibility = "CJK_Compatibility" + /// 3400..4DBF; CJK Unified Ideographs Extension A + case cjkUnifiedIdeographsExtensionA = "CJK_Unified_Ideographs_Extension_A" + 
/// 4DC0..4DFF; Yijing Hexagram Symbols + case yijingHexagramSymbols = "Yijing_Hexagram_Symbols" + /// 4E00..9FFF; CJK Unified Ideographs + case cjkUnifiedIdeographs = "CJK_Unified_Ideographs" + /// A000..A48F; Yi Syllables + case yiSyllables = "Yi_Syllables" + /// A490..A4CF; Yi Radicals + case yiRadicals = "Yi_Radicals" + /// A4D0..A4FF; Lisu + case lisu = "Lisu" + /// A500..A63F; Vai + case vai = "Vai" + /// A640..A69F; Cyrillic Extended-B + case cyrillicExtendedB = "Cyrillic_Extended_B" + /// A6A0..A6FF; Bamum + case bamum = "Bamum" + /// A700..A71F; Modifier Tone Letters + case modifierToneLetters = "Modifier_Tone_Letters" + /// A720..A7FF; Latin Extended-D + case latinExtendedD = "Latin_Extended_D" + /// A800..A82F; Syloti Nagri + case sylotiNagri = "Syloti_Nagri" + /// A830..A83F; Common Indic Number Forms + case commonIndicNumberForms = "Common_Indic_Number_Forms" + /// A840..A87F; Phags-pa + case phagsPA = "Phags_pa" + /// A880..A8DF; Saurashtra + case saurashtra = "Saurashtra" + /// A8E0..A8FF; Devanagari Extended + case devanagariExtended = "Devanagari_Extended" + /// A900..A92F; Kayah Li + case kayahLi = "Kayah_Li" + /// A930..A95F; Rejang + case rejang = "Rejang" + /// A960..A97F; Hangul Jamo Extended-A + case hangulJamoExtendedA = "Hangul_Jamo_Extended_A" + /// A980..A9DF; Javanese + case javanese = "Javanese" + /// A9E0..A9FF; Myanmar Extended-B + case myanmarExtendedB = "Myanmar_Extended_B" + /// AA00..AA5F; Cham + case cham = "Cham" + /// AA60..AA7F; Myanmar Extended-A + case myanmarExtendedA = "Myanmar_Extended_A" + /// AA80..AADF; Tai Viet + case taiViet = "Tai_Viet" + /// AAE0..AAFF; Meetei Mayek Extensions + case meeteiMayekExtensions = "Meetei_Mayek_Extensions" + /// AB00..AB2F; Ethiopic Extended-A + case ethiopicExtendedA = "Ethiopic_Extended_A" + /// AB30..AB6F; Latin Extended-E + case latinExtendedE = "Latin_Extended_E" + /// AB70..ABBF; Cherokee Supplement + case cherokeeSupplement = "Cherokee_Supplement" + /// ABC0..ABFF; Meetei Mayek + case meeteiMayek = "Meetei_Mayek" + /// AC00..D7AF; Hangul Syllables + case hangulSyllables = "Hangul_Syllables" + /// D7B0..D7FF; Hangul Jamo Extended-B + case hangulJamoExtendedB = "Hangul_Jamo_Extended_B" + /// D800..DB7F; High Surrogates + case highSurrogates = "High_Surrogates" + /// DB80..DBFF; High Private Use Surrogates + case highPrivateUseSurrogates = "High_Private_Use_Surrogates" + /// DC00..DFFF; Low Surrogates + case lowSurrogates = "Low_Surrogates" + /// E000..F8FF; Private Use Area + case privateUseArea = "Private_Use_Area" + /// F900..FAFF; CJK Compatibility Ideographs + case cjkCompatibilityIdeographs = "CJK_Compatibility_Ideographs" + /// FB00..FB4F; Alphabetic Presentation Forms + case alphabeticPresentationForms = "Alphabetic_Presentation_Forms" + /// FB50..FDFF; Arabic Presentation Forms-A + case arabicPresentationFormsA = "Arabic_Presentation_Forms_A" + /// FE00..FE0F; Variation Selectors + case variationSelectors = "Variation_Selectors" + /// FE10..FE1F; Vertical Forms + case verticalForms = "Vertical_Forms" + /// FE20..FE2F; Combining Half Marks + case combiningHalfMarks = "Combining_Half_Marks" + /// FE30..FE4F; CJK Compatibility Forms + case cjkcompatibilityForms = "CJK_Compatibility_Forms" + /// FE50..FE6F; Small Form Variants + case smallFormVariants = "Small_Form_Variants" + /// FE70..FEFF; Arabic Presentation Forms-B + case arabicPresentationFormsB = "Arabic_Presentation_Forms_B" + /// FF00..FFEF; Halfwidth and Fullwidth Forms + case halfwidthAndFullwidthForms = "Halfwidth_and_Fullwidth_Forms" + /// 
FFF0..FFFF; Specials + case specials = "Specials" + /// 10000..1007F; Linear B Syllabary + case linearBSyllabary = "Linear_B_Syllabary" + /// 10080..100FF; Linear B Ideograms + case linearBIdeograms = "Linear_B_Ideograms" + /// 10100..1013F; Aegean Numbers + case aegeanNumbers = "Aegean_Numbers" + /// 10140..1018F; Ancient Greek Numbers + case ancientGreekNumbers = "Ancient_Greek_Numbers" + /// 10190..101CF; Ancient Symbols + case ancientSymbols = "Ancient_Symbols" + /// 101D0..101FF; Phaistos Disc + case phaistosDisc = "Phaistos_Disc" + /// 10280..1029F; Lycian + case lycian = "Lycian" + /// 102A0..102DF; Carian + case carian = "Carian" + /// 102E0..102FF; Coptic Epact Numbers + case copticEpactNumbers = "Coptic_Epact_Numbers" + /// 10300..1032F; Old Italic + case oldItalic = "Old_Italic" + /// 10330..1034F; Gothic + case gothic = "Gothic" + /// 10350..1037F; Old Permic + case oldPermic = "Old_Permic" + /// 10380..1039F; Ugaritic + case ugaritic = "Ugaritic" + /// 103A0..103DF; Old Persian + case oldPersian = "Old_Persian" + /// 10400..1044F; Deseret + case deseret = "Deseret" + /// 10450..1047F; Shavian + case shavian = "Shavian" + /// 10480..104AF; Osmanya + case osmanya = "Osmanya" + /// 104B0..104FF; Osage + case osage = "Osage" + /// 10500..1052F; Elbasan + case elbasan = "Elbasan" + /// 10530..1056F; Caucasian Albanian + case caucasianAlbanian = "Caucasian_Albanian" + /// 10570..105BF; Vithkuqi + case vithkuqi = "Vithkuqi" + /// 10600..1077F; Linear A + case linearA = "Linear_A" + /// 10780..107BF; Latin Extended-F + case latinExtendedF = "Latin_Extended_F" + /// 10800..1083F; Cypriot Syllabary + case cypriotSyllabary = "Cypriot_Syllabary" + /// 10840..1085F; Imperial Aramaic + case imperialAramaic = "Imperial_Aramaic" + /// 10860..1087F; Palmyrene + case palmyrene = "Palmyrene" + /// 10880..108AF; Nabataean + case nabataean = "Nabataean" + /// 108E0..108FF; Hatran + case hatran = "Hatran" + /// 10900..1091F; Phoenician + case phoenician = "Phoenician" + /// 10920..1093F; Lydian + case lydian = "Lydian" + /// 10980..1099F; Meroitic Hieroglyphs + case meroiticHieroglyphs = "Meroitic_Hieroglyphs" + /// 109A0..109FF; Meroitic Cursive + case meroiticCursive = "Meroitic_Cursive" + /// 10A00..10A5F; Kharoshthi + case kharoshthi = "Kharoshthi" + /// 10A60..10A7F; Old South Arabian + case oldSouthArabian = "Old_South_Arabian" + /// 10A80..10A9F; Old North Arabian + case oldNorthArabian = "Old_North_Arabian" + /// 10AC0..10AFF; Manichaean + case manichaean = "Manichaean" + /// 10B00..10B3F; Avestan + case avestan = "Avestan" + /// 10B40..10B5F; Inscriptional Parthian + case inscriptionalParthian = "Inscriptional_Parthian" + /// 10B60..10B7F; Inscriptional Pahlavi + case inscriptionalPahlavi = "Inscriptional_Pahlavi" + /// 10B80..10BAF; Psalter Pahlavi + case psalterPahlavi = "Psalter_Pahlavi" + /// 10C00..10C4F; Old Turkic + case oldTurkic = "Old_Turkic" + /// 10C80..10CFF; Old Hungarian + case oldHungarian = "Old_Hungarian" + /// 10D00..10D3F; Hanifi Rohingya + case hanifiRohingya = "Hanifi_Rohingya" + /// 10E60..10E7F; Rumi Numeral Symbols + case rumiNumeralSymbols = "Rumi_Numeral_Symbols" + /// 10E80..10EBF; Yezidi + case yezidi = "Yezidi" + /// 10F00..10F2F; Old Sogdian + case oldSogdian = "Old_Sogdian" + /// 10F30..10F6F; Sogdian + case sogdian = "Sogdian" + /// 10F70..10FAF; Old Uyghur + case oldUyghur = "Old_Uyghur" + /// 10FB0..10FDF; Chorasmian + case chorasmian = "Chorasmian" + /// 10FE0..10FFF; Elymaic + case elymaic = "Elymaic" + /// 11000..1107F; Brahmi + case brahmi = "Brahmi" 
+ /// 11080..110CF; Kaithi + case kaithi = "Kaithi" + /// 110D0..110FF; Sora Sompeng + case soraSompeng = "Sora_Sompeng" + /// 11100..1114F; Chakma + case chakma = "Chakma" + /// 11150..1117F; Mahajani + case mahajani = "Mahajani" + /// 11180..111DF; Sharada + case sharada = "Sharada" + /// 111E0..111FF; Sinhala Archaic Numbers + case sinhalaArchaicNumbers = "Sinhala_Archaic_Numbers" + /// 11200..1124F; Khojki + case khojki = "Khojki" + /// 11280..112AF; Multani + case multani = "Multani" + /// 112B0..112FF; Khudawadi + case khudawadi = "Khudawadi" + /// 11300..1137F; Grantha + case grantha = "Grantha" + /// 11400..1147F; Newa + case newa = "Newa" + /// 11480..114DF; Tirhuta + case tirhuta = "Tirhuta" + /// 11580..115FF; Siddham + case siddham = "Siddham" + /// 11600..1165F; Modi + case modi = "Modi" + /// 11660..1167F; Mongolian Supplement + case mongolianSupplement = "Mongolian_Supplement" + /// 11680..116CF; Takri + case takri = "Takri" + /// 11700..1174F; Ahom + case ahom = "Ahom" + /// 11800..1184F; Dogra + case dogra = "Dogra" + /// 118A0..118FF; Warang Citi + case warangCiti = "Warang_Citi" + /// 11900..1195F; Dives Akuru + case divesAkuru = "Dives_Akuru" + /// 119A0..119FF; Nandinagari + case nandinagari = "Nandinagari" + /// 11A00..11A4F; Zanabazar Square + case zanabazarSquare = "Zanabazar_Square" + /// 11A50..11AAF; Soyombo + case soyombo = "Soyombo" + /// 11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A + case unifiedCanadianAboriginalSyllabicsExtendedA = "Unified_Canadian_Aboriginal_Syllabics_Extended_A" + /// 11AC0..11AFF; Pau Cin Hau + case pauCinHau = "Pau_Cin_Hau" + /// 11C00..11C6F; Bhaiksuki + case bhaiksuki = "Bhaiksuki" + /// 11C70..11CBF; Marchen + case marchen = "Marchen" + /// 11D00..11D5F; Masaram Gondi + case masaramGondi = "Masaram_Gondi" + /// 11D60..11DAF; Gunjala Gondi + case gunjalaGondi = "Gunjala_Gondi" + /// 11EE0..11EFF; Makasar + case makasar = "Makasar" + /// 11FB0..11FBF; Lisu Supplement + case lisuSupplement = "Lisu_Supplement" + /// 11FC0..11FFF; Tamil Supplement + case tamilSupplement = "Tamil_Supplement" + /// 12000..123FF; Cuneiform + case cuneiform = "Cuneiform" + /// 12400..1247F; Cuneiform Numbers and Punctuation + case cuneiformNumbersAndPunctuation = "Cuneiform_Numbers_and_Punctuation" + /// 12480..1254F; Early Dynastic Cuneiform + case earlyDynasticCuneiform = "Early_Dynastic_Cuneiform" + /// 12F90..12FFF; Cypro-Minoan + case cyproMinoan = "Cypro_Minoan" + /// 13000..1342F; Egyptian Hieroglyphs + case egyptianHieroglyphs = "Egyptian_Hieroglyphs" + /// 13430..1343F; Egyptian Hieroglyph Format Controls + case egyptianHieroglyphFormatControls = "Egyptian_Hieroglyph_Format_Controls" + /// 14400..1467F; Anatolian Hieroglyphs + case anatolianHieroglyphs = "Anatolian_Hieroglyphs" + /// 16800..16A3F; Bamum Supplement + case bamumSupplement = "Bamum_Supplement" + /// 16A40..16A6F; Mro + case mro = "Mro" + /// 16A70..16ACF; Tangsa + case tangsa = "Tangsa" + /// 16AD0..16AFF; Bassa Vah + case bassaVah = "Bassa_Vah" + /// 16B00..16B8F; Pahawh Hmong + case pahawhHmong = "Pahawh_Hmong" + /// 16E40..16E9F; Medefaidrin + case medefaidrin = "Medefaidrin" + /// 16F00..16F9F; Miao + case miao = "Miao" + /// 16FE0..16FFF; Ideographic Symbols and Punctuation + case ideographicSymbolsAndPunctuation = "Ideographic_Symbols_and_Punctuation" + /// 17000..187FF; Tangut + case tangut = "Tangut" + /// 18800..18AFF; Tangut Components + case tangutComponents = "Tangut_Components" + /// 18B00..18CFF; Khitan Small Script + case khitanSmallScript = 
"Khitan_Small_Script" + /// 18D00..18D7F; Tangut Supplement + case tangutSupplement = "Tangut_Supplement" + /// 1AFF0..1AFFF; Kana Extended-B + case kanaExtendedB = "Kana_Extended_B" + /// 1B000..1B0FF; Kana Supplement + case kanaSupplement = "Kana_Supplement" + /// 1B100..1B12F; Kana Extended-A + case kanaExtendedA = "Kana_Extended_A" + /// 1B130..1B16F; Small Kana Extension + case smallKanaExtension = "Small_Kana_Extension" + /// 1B170..1B2FF; Nushu + case nushu = "Nushu" + /// 1BC00..1BC9F; Duployan + case duployan = "Duployan" + /// 1BCA0..1BCAF; Shorthand Format Controls + case shorthandFormatControls = "Shorthand_Format_Controls" + /// 1CF00..1CFCF; Znamenny Musical Notation + case znamennyMusicalNotation = "Znamenny_Musical_Notation" + /// 1D000..1D0FF; Byzantine Musical Symbols + case byzantineMusicalSymbols = "Byzantine_Musical_Symbols" + /// 1D100..1D1FF; Musical Symbols + case musicalSymbols = "Musical_Symbols" + /// 1D200..1D24F; Ancient Greek Musical Notation + case ancientGreekMusicalNotation = "Ancient_Greek_Musical_Notation" + /// 1D2E0..1D2FF; Mayan Numerals + case mayanNumerals = "Mayan_Numerals" + /// 1D300..1D35F; Tai Xuan Jing Symbols + case taiXuanJingSymbols = "Tai_Xuan_Jing_Symbols" + /// 1D360..1D37F; Counting Rod Numerals + case countingRodNumerals = "Counting_Rod_Numerals" + /// 1D400..1D7FF; Mathematical Alphanumeric Symbols + case mathematicalAlphanumericSymbols = "Mathematical_Alphanumeric_Symbols" + /// 1D800..1DAAF; Sutton SignWriting + case suttonSignwriting = "Sutton_SignWriting" + /// 1DF00..1DFFF; Latin Extended-G + case latinExtendedG = "Latin_Extended_G" + /// 1E000..1E02F; Glagolitic Supplement + case glagoliticSupplement = "Glagolitic_Supplement" + /// 1E100..1E14F; Nyiakeng Puachue Hmong + case nyiakengPuachueHmong = "Nyiakeng_Puachue_Hmong" + /// 1E290..1E2BF; Toto + case toto = "Toto" + /// 1E2C0..1E2FF; Wancho + case wancho = "Wancho" + /// 1E7E0..1E7FF; Ethiopic Extended-B + case ethiopicExtendedB = "Ethiopic_Extended_B" + /// 1E800..1E8DF; Mende Kikakui + case mendeKikakui = "Mende_Kikakui" + /// 1E900..1E95F; Adlam + case adlam = "Adlam" + /// 1EC70..1ECBF; Indic Siyaq Numbers + case indicSiyaqNumbers = "Indic_Siyaq_Numbers" + /// 1ED00..1ED4F; Ottoman Siyaq Numbers + case ottomanSiyaqNumbers = "Ottoman_Siyaq_Numbers" + /// 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols + case arabicMathematicalAlphabeticSymbols = "Arabic_Mathematical_Alphabetic_Symbols" + /// 1F000..1F02F; Mahjong Tiles + case mahjongTiles = "Mahjong_Tiles" + /// 1F030..1F09F; Domino Tiles + case dominoTiles = "Domino_Tiles" + /// 1F0A0..1F0FF; Playing Cards + case playingCards = "Playing_Cards" + /// 1F100..1F1FF; Enclosed Alphanumeric Supplement + case enclosedAlphanumericSupplement = "Enclosed_Alphanumeric_Supplement" + /// 1F200..1F2FF; Enclosed Ideographic Supplement + case enclosedIdeographicSupplement = "Enclosed_Ideographic_Supplement" + /// 1F300..1F5FF; Miscellaneous Symbols and Pictographs + case miscellaneousSymbolsandPictographs = "Miscellaneous_Symbols_and_Pictographs" + /// 1F600..1F64F; Emoticons + case emoticons = "Emoticons" + /// 1F650..1F67F; Ornamental Dingbats + case ornamentalDingbats = "Ornamental_Dingbats" + /// 1F680..1F6FF; Transport and Map Symbols + case transportAndMapSymbols = "Transport_and_Map_Symbols" + /// 1F700..1F77F; Alchemical Symbols + case alchemicalSymbols = "Alchemical_Symbols" + /// 1F780..1F7FF; Geometric Shapes Extended + case geometricShapesExtended = "Geometric_Shapes_Extended" + /// 1F800..1F8FF; Supplemental Arrows-C + 
case supplementalArrowsC = "Supplemental_Arrows_C" + /// 1F900..1F9FF; Supplemental Symbols and Pictographs + case supplementalSymbolsAndPictographs = "Supplemental_Symbols_and_Pictographs" + /// 1FA00..1FA6F; Chess Symbols + case chessSymbols = "Chess_Symbols" + /// 1FA70..1FAFF; Symbols and Pictographs Extended-A + case symbolsAndPictographsExtendedA = "Symbols_and_Pictographs_Extended_A" + /// 1FB00..1FBFF; Symbols for Legacy Computing + case symbolsForLegacyComputing = "Symbols_for_Legacy_Computing" + /// 20000..2A6DF; CJK Unified Ideographs Extension B + case cjkUnifiedIdeographsExtensionB = "CJK_Unified_Ideographs_Extension_B" + /// 2A700..2B73F; CJK Unified Ideographs Extension C + case cjkUnifiedIdeographsExtensionC = "CJK_Unified_Ideographs_Extension_C" + /// 2B740..2B81F; CJK Unified Ideographs Extension D + case cjkUnifiedIdeographsExtensionD = "CJK_Unified_Ideographs_Extension_D" + /// 2B820..2CEAF; CJK Unified Ideographs Extension E + case cjkUnifiedIdeographsExtensionE = "CJK_Unified_Ideographs_Extension_E" + /// 2CEB0..2EBEF; CJK Unified Ideographs Extension F + case cjkUnifiedIdeographsExtensionF = "CJK_Unified_Ideographs_Extension_F" + /// 2F800..2FA1F; CJK Compatibility Ideographs Supplement + case cjkCompatibilityIdeographsSupplement = "CJK_Compatibility_Ideographs_Supplement" + /// 30000..3134F; CJK Unified Ideographs Extension G + case cjkUnifiedIdeographsExtensionG = "CJK_Unified_Ideographs_Extension_G" + /// E0000..E007F; Tags + case tags = "Tags" + /// E0100..E01EF; Variation Selectors Supplement + case variationSelectorsSupplement = "Variation_Selectors_Supplement" + /// F0000..FFFFF; Supplementary Private Use Area-A + case supplementaryPrivateUseAreaA = "Supplementary_Private_Use_Area_A" + /// 100000..10FFFF; Supplementary Private Use Area-B + case supplementaryPrivateUseAreaB = "Supplementary_Private_Use_Area_B" + /// @missing: 0000..10FFFF; No_Block + case noBlock = "No_Block" + } } extension Character { diff --git a/Sources/_RegexParser/Utility/TypeConstruction.swift b/Sources/_RegexParser/Utility/TypeConstruction.swift index 524b24917..e368d3513 100644 --- a/Sources/_RegexParser/Utility/TypeConstruction.swift +++ b/Sources/_RegexParser/Utility/TypeConstruction.swift @@ -17,19 +17,7 @@ // const Metadata * const *elements, // const char *labels, // const ValueWitnessTable *proposedWitnesses); -// -// SWIFT_RUNTIME_EXPORT SWIFT_CC(swift) -// MetadataResponse -// swift_getTupleTypeMetadata2(MetadataRequest request, -// const Metadata *elt0, const Metadata *elt1, -// const char *labels, -// const ValueWitnessTable *proposedWitnesses); -// SWIFT_RUNTIME_EXPORT SWIFT_CC(swift) -// MetadataResponse -// swift_getTupleTypeMetadata3(MetadataRequest request, -// const Metadata *elt0, const Metadata *elt1, -// const Metadata *elt2, const char *labels, -// const ValueWitnessTable *proposedWitnesses); + @_silgen_name("swift_getTupleTypeMetadata") private func swift_getTupleTypeMetadata( @@ -40,31 +28,13 @@ private func swift_getTupleTypeMetadata( proposedWitnesses: UnsafeRawPointer? ) -> (value: Any.Type, state: Int) -@_silgen_name("swift_getTupleTypeMetadata2") -private func swift_getTupleTypeMetadata2( - request: Int, - element1: Any.Type, - element2: Any.Type, - labels: UnsafePointer?, - proposedWitnesses: UnsafeRawPointer? 
-) -> (value: Any.Type, state: Int) - -@_silgen_name("swift_getTupleTypeMetadata3") -private func swift_getTupleTypeMetadata3( - request: Int, - element1: Any.Type, - element2: Any.Type, - element3: Any.Type, - labels: UnsafePointer?, - proposedWitnesses: UnsafeRawPointer? -) -> (value: Any.Type, state: Int) - public enum TypeConstruction { /// Returns a tuple metatype of the given element types. public static func tupleType< ElementTypes: BidirectionalCollection >( - of elementTypes: __owned ElementTypes + of elementTypes: __owned ElementTypes, + labels: String? = nil ) -> Any.Type where ElementTypes.Element == Any.Type { // From swift/ABI/Metadata.h: // template @@ -78,39 +48,50 @@ public enum TypeConstruction { let elementCountFlag = 0x0000FFFF assert(elementTypes.count != 1, "A one-element tuple is not a realistic Swift type") assert(elementTypes.count <= elementCountFlag, "Tuple size exceeded \(elementCountFlag)") - switch elementTypes.count { - case 2: - return swift_getTupleTypeMetadata2( - request: 0, - element1: elementTypes[elementTypes.startIndex], - element2: elementTypes[elementTypes.index(elementTypes.startIndex, offsetBy: 1)], - labels: nil, - proposedWitnesses: nil).value - case 3: - return swift_getTupleTypeMetadata3( - request: 0, - element1: elementTypes[elementTypes.startIndex], - element2: elementTypes[elementTypes.index(elementTypes.startIndex, offsetBy: 1)], - element3: elementTypes[elementTypes.index(elementTypes.startIndex, offsetBy: 2)], - labels: nil, - proposedWitnesses: nil).value - default: - let result = elementTypes.withContiguousStorageIfAvailable { elementTypesBuffer in - swift_getTupleTypeMetadata( + + var flags = elementTypes.count + + // If we have labels to provide, then say the label pointer is not constant + // because that pointer is only valid for the duration of the + // 'swift_getTupleTypeMetadata' call. If we don't have labels, then + // our label pointer will be empty and constant. + if labels != nil { + // Has non-constant labels + flags |= 0x10000 + } + + let result = elementTypes.withContiguousStorageIfAvailable { elementTypesBuffer in + if let labels = labels { + return labels.withCString { labelsPtr in + swift_getTupleTypeMetadata( + request: 0, + flags: flags, + elements: elementTypesBuffer.baseAddress, + labels: labelsPtr, + proposedWitnesses: nil + ) + } + } else { + return swift_getTupleTypeMetadata( request: 0, - flags: elementTypesBuffer.count, + flags: flags, elements: elementTypesBuffer.baseAddress, labels: nil, - proposedWitnesses: nil).value + proposedWitnesses: nil + ) } - guard let result = result else { - fatalError(""" - The collection of element types does not support an internal representation of - contiguous storage - """) - } - return result } + + guard let result = result else { + fatalError( + """ + The collection of element types does not support an internal representation of + contiguous storage + """ + ) + } + + return result.value } /// Creates a type-erased tuple with the given elements.
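A minimal usage sketch of the widened `tupleType(of:labels:)` entry point (not part of the diff; the space-separated, space-terminated label encoding shown here is an assumption about the runtime's tuple-label convention rather than documented behavior):

import _RegexParser

// Unlabeled tuple metadata, equivalent to the pre-existing behavior.
let unlabeled = TypeConstruction.tupleType(of: [Int.self, String.self])
assert(unlabeled == (Int, String).self)

// Labeled tuple metadata via the new `labels:` parameter.
// "number text " is an assumed encoding: one entry per element, each followed by a space.
let labeled = TypeConstruction.tupleType(
  of: [Int.self, String.self],
  labels: "number text "
)
// Under that assumption, `labeled` would be (number: Int, text: String).self.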
diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift index 2a1ef72a2..020ea8208 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift @@ -12,10 +12,10 @@ // MARK: `CollectionSearcher` algorithms extension Collection { - func contains( + func _contains( _ searcher: Searcher ) -> Bool where Searcher.Searched == Self { - firstRange(of: searcher) != nil + _firstRange(of: searcher) != nil } } @@ -27,6 +27,7 @@ extension Collection where Element: Equatable { /// - Parameter other: A sequence to search for within this collection. /// - Returns: `true` if the collection contains the specified sequence, /// otherwise `false`. + @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func contains(_ other: C) -> Bool where C.Element == Element @@ -36,7 +37,7 @@ extension Collection where Element: Equatable { } extension BidirectionalCollection where Element: Comparable { - func contains(_ other: C) -> Bool + func _contains(_ other: C) -> Bool where C.Element == Element { if #available(SwiftStdlib 5.7, *) { @@ -49,11 +50,13 @@ extension BidirectionalCollection where Element: Comparable { // Overload breakers extension StringProtocol { + @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func contains(_ other: String) -> Bool { firstRange(of: other) != nil } + @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func contains(_ other: Substring) -> Bool { firstRange(of: other) != nil @@ -68,8 +71,9 @@ extension BidirectionalCollection where SubSequence == Substring { /// - Parameter regex: A regex to search for within this collection. /// - Returns: `true` if the regex was found in the collection, otherwise /// `false`. + @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func contains(_ regex: R) -> Bool { - contains(RegexConsumer(regex)) + _contains(RegexConsumer(regex)) } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index 42703827e..30c2fac92 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -12,7 +12,7 @@ // MARK: `CollectionSearcher` algorithms extension Collection { - func firstRange( + func _firstRange( of searcher: S ) -> Range? where S.Searched == Self { var state = searcher.state(for: self, in: startIndex..( + func _lastRange( of searcher: S ) -> Range? where S.BackwardSearched == Self { var state = searcher.backwardState(for: self, in: startIndex..(of regex: R) -> Range? { - firstRange(of: RegexConsumer(regex)) + _firstRange(of: RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - func lastRange(of regex: R) -> Range? { - lastRange(of: RegexConsumer(regex)) + func _lastRange(of regex: R) -> Range? 
{ + _lastRange(of: RegexConsumer(regex)) } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 33a9748ac..578c499d1 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -157,7 +157,7 @@ extension ReversedRangesCollection: Sequence { // MARK: `CollectionSearcher` algorithms extension Collection { - func ranges( + func _ranges( of searcher: S ) -> RangesCollection where S.Searched == Self { RangesCollection(base: self, searcher: searcher) @@ -165,7 +165,7 @@ } extension BidirectionalCollection { - func rangesFromBack( + func _rangesFromBack( of searcher: S ) -> ReversedRangesCollection where S.BackwardSearched == Self { ReversedRangesCollection(base: self, searcher: searcher) @@ -175,10 +175,10 @@ // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { - func ranges( + func _ranges( of other: C ) -> RangesCollection> where C.Element == Element { - ranges(of: ZSearcher(pattern: Array(other), by: ==)) + _ranges(of: ZSearcher(pattern: Array(other), by: ==)) } // FIXME: Return `some Collection>` for SE-0346 @@ -191,7 +191,7 @@ public func ranges( of other: C ) -> [Range] where C.Element == Element { - ranges(of: ZSearcher(pattern: Array(other), by: ==)).map { $0 } + Array(_ranges(of: other)) } } @@ -207,12 +207,12 @@ extension BidirectionalCollection where Element: Equatable { } extension BidirectionalCollection where Element: Comparable { - func ranges( + func _ranges( of other: C ) -> RangesCollection>> where C.Element == Element { - ranges(of: PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(other)))) + _ranges(of: PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(other)))) } // FIXME @@ -231,17 +231,17 @@ extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) @_disfavoredOverload - func ranges( + func _ranges( of regex: R ) -> RangesCollection> { - ranges(of: RegexConsumer(regex)) + _ranges(of: RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - func rangesFromBack( + func _rangesFromBack( of regex: R ) -> ReversedRangesCollection> { - rangesFromBack(of: RegexConsumer(regex)) + _rangesFromBack(of: RegexConsumer(regex)) } // FIXME: Return `some Collection>` for SE-0346 @@ -251,10 +251,11 @@ extension BidirectionalCollection where SubSequence == Substring { /// - Parameter regex: The regex to search for. /// - Returns: A collection of ranges in the receiver of all occurrences of /// `regex`. Returns an empty collection if `regex` is not found.
+ @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func ranges( of regex: R ) -> [Range] { - Array(ranges(of: RegexConsumer(regex))) + Array(_ranges(of: regex)) } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index 217fb90d6..5c2bc035f 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -12,7 +12,7 @@ // MARK: `CollectionSearcher` algorithms extension RangeReplaceableCollection { - func replacing( + func _replacing( _ searcher: Searcher, with replacement: Replacement, subrange: Range, @@ -26,7 +26,7 @@ extension RangeReplaceableCollection { var result = Self() result.append(contentsOf: self[..( + func _replacing( _ searcher: Searcher, with replacement: Replacement, maxReplacements: Int = .max ) -> Self where Searcher.Searched == SubSequence, Replacement.Element == Element { - replacing( + _replacing( searcher, with: replacement, subrange: startIndex..( _ searcher: Searcher, with replacement: Replacement, maxReplacements: Int = .max ) where Searcher.Searched == SubSequence, Replacement.Element == Element { - self = replacing( + self = _replacing( searcher, with: replacement, maxReplacements: maxReplacements) @@ -84,7 +84,7 @@ extension RangeReplaceableCollection where Element: Equatable { subrange: Range, maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { - replacing( + _replacing( ZSearcher(pattern: Array(other), by: ==), with: replacement, subrange: subrange, @@ -136,37 +136,37 @@ extension RangeReplaceableCollection where Element: Equatable { extension RangeReplaceableCollection where Self: BidirectionalCollection, Element: Comparable { - func replacing( + func _replacing( _ other: C, with replacement: Replacement, subrange: Range, maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { - replacing( + _replacing( PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(other))), with: replacement, subrange: subrange, maxReplacements: maxReplacements) } - func replacing( + func _replacing( _ other: C, with replacement: Replacement, maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { - replacing( + _replacing( other, with: replacement, subrange: startIndex..( + mutating func _replace( _ other: C, with replacement: Replacement, maxReplacements: Int = .max ) where C.Element == Element, Replacement.Element == Element { - self = replacing( + self = _replacing( other, with: replacement, subrange: startIndex.., maxReplacements: Int = .max ) -> Self where Replacement.Element == Element { - replacing( + _replacing( RegexConsumer(regex), with: replacement, subrange: subrange, @@ -210,6 +210,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// sequence matching `regex` to replace. Default is `Int.max`. /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` are replaced by `replacement`. 
+ @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func replacing( _ regex: R, diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift index ab465c382..16cc47c39 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift @@ -34,7 +34,7 @@ struct SplitCollection { maxSplits: Int, omittingEmptySubsequences: Bool) { - self.ranges = base.ranges(of: searcher) + self.ranges = base._ranges(of: searcher) self.maxSplits = maxSplits self.omittingEmptySubsequences = omittingEmptySubsequences } @@ -183,7 +183,7 @@ struct ReversedSplitCollection { } init(base: Base, searcher: Searcher) { - self.ranges = base.rangesFromBack(of: searcher) + self.ranges = base._rangesFromBack(of: searcher) } } @@ -307,13 +307,17 @@ extension Collection where Element: Equatable { /// - Parameter separator: The element to be split upon. /// - Returns: A collection of subsequences, split from this collection's /// elements. + @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func split( separator: C, maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [SubSequence] where C.Element == Element { - Array(split(by: ZSearcher(pattern: Array(separator), by: ==), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) + Array(split( + by: ZSearcher(pattern: Array(separator), by: ==), + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences)) } } @@ -352,6 +356,45 @@ extension BidirectionalCollection where Element: Comparable { // } } +// String split overload breakers +// +// These are underscored and marked as SPI so that the *actual* public overloads +// are only visible in RegexBuilder, to avoid breaking source with the +// standard library's function of the same name that takes a `Character` +// as the separator. *Those* overloads are necessary as tie-breakers between +// the Collection-based and Regex-based `split`s, which in turn are both marked +// @_disfavoredOverload to avoid the wrong overload being selected when a +// collection's element type could be used interchangeably with a collection of +// that element (e.g. `Array.split(separator: [])`).
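+//
+// A rough usage sketch (assuming the public `split(separator:)` overloads
+// that RegexBuilder layers on top of these SPI entry points are in scope):
+//
+//     let parts = "x::y::z".split(separator: "::")
+//     // parts == ["x", "y", "z"]; a multi-character separator cannot be a
+//     // `Character`, so it resolves to the string-separator overload rather
+//     // than the standard library's element-based `split`.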
+ +extension StringProtocol where SubSequence == Substring { + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func _split( + separator: String, + maxSplits: Int = .max, + omittingEmptySubsequences: Bool = true + ) -> [Substring] { + Array(split( + by: ZSearcher(pattern: Array(separator), by: ==), + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences)) + } + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func _split( + separator: Substring, + maxSplits: Int = .max, + omittingEmptySubsequences: Bool = true + ) -> [Substring] { + Array(split( + by: ZSearcher(pattern: Array(separator), by: ==), + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences)) + } +} + // MARK: Regex algorithms @available(SwiftStdlib 5.7, *) @@ -388,6 +431,9 @@ extension BidirectionalCollection where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [SubSequence] { - Array(split(by: RegexConsumer(separator), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) + Array(split( + by: RegexConsumer(separator), + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences)) } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/StartsWith.swift b/Sources/_StringProcessing/Algorithms/Algorithms/StartsWith.swift index 2f45a734b..c8aaf9126 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/StartsWith.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/StartsWith.swift @@ -12,7 +12,7 @@ // MARK: `CollectionConsumer` algorithms extension Collection { - func starts(with consumer: C) -> Bool + func _starts(with consumer: C) -> Bool where C.Consumed == SubSequence { consumer.consuming(self[...]) != nil @@ -20,7 +20,7 @@ extension Collection { } extension BidirectionalCollection { - func ends(with consumer: C) -> Bool + func _ends(with consumer: C) -> Bool where C.Consumed == SubSequence { consumer.consumingBack(self[...]) != nil @@ -30,18 +30,18 @@ extension BidirectionalCollection { // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { - func starts(with prefix: C) -> Bool + func _starts(with prefix: C) -> Bool where C.Element == Element { - starts(with: FixedPatternConsumer(pattern: prefix)) + _starts(with: FixedPatternConsumer(pattern: prefix)) } } extension BidirectionalCollection where Element: Equatable { - func ends(with suffix: C) -> Bool + func _ends(with suffix: C) -> Bool where C.Element == Element { - ends(with: FixedPatternConsumer(pattern: suffix)) + _ends(with: FixedPatternConsumer(pattern: suffix)) } } @@ -56,10 +56,10 @@ extension BidirectionalCollection where SubSequence == Substring { /// - Returns: `true` if the initial elements of the sequence matches the /// beginning of `regex`; otherwise, `false`. 
public func starts(with regex: R) -> Bool { - starts(with: RegexConsumer(regex)) + _starts(with: RegexConsumer(regex)) } - func ends(with regex: R) -> Bool { - ends(with: RegexConsumer(regex)) + func _ends(with regex: R) -> Bool { + _ends(with: RegexConsumer(regex)) } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift index 7411236da..16a3cb207 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift @@ -12,7 +12,7 @@ // MARK: `CollectionConsumer` algorithms extension Collection { - func trimmingPrefix( + func _trimmingPrefix( _ consumer: Consumer ) -> SubSequence where Consumer.Consumed == Self { let start = consumer.consuming(self) ?? startIndex @@ -21,7 +21,7 @@ extension Collection { } extension Collection where SubSequence == Self { - mutating func trimPrefix( + mutating func _trimPrefix( _ consumer: Consumer ) where Consumer.Consumed == Self { _ = consumer.consume(&self) @@ -32,7 +32,7 @@ extension RangeReplaceableCollection { // NOTE: Disfavored because the `Collection with SubSequence == Self` overload // should be preferred whenever both are available @_disfavoredOverload - mutating func trimPrefix( + mutating func _trimPrefix( _ consumer: Consumer ) where Consumer.Consumed == Self { if let start = consumer.consuming(self) { @@ -42,7 +42,7 @@ extension RangeReplaceableCollection { } extension BidirectionalCollection { - func trimmingSuffix( + func _trimmingSuffix( _ consumer: Consumer ) -> SubSequence where Consumer.Consumed == Self @@ -51,7 +51,7 @@ extension BidirectionalCollection { return self[..( + func _trimming( _ consumer: Consumer ) -> SubSequence where Consumer.Consumed == Self { // NOTE: Might give different results than trimming the suffix before @@ -64,24 +64,24 @@ extension BidirectionalCollection { } extension BidirectionalCollection where SubSequence == Self { - mutating func trimSuffix( + mutating func _trimSuffix( _ consumer: Consumer ) where Consumer.Consumed == SubSequence { _ = consumer.consumeBack(&self) } - mutating func trim( + mutating func _trim( _ consumer: Consumer ) where Consumer.Consumed == Self { - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } extension RangeReplaceableCollection where Self: BidirectionalCollection { @_disfavoredOverload - mutating func trimSuffix( + mutating func _trimSuffix( _ consumer: Consumer ) where Consumer.Consumed == Self { @@ -91,11 +91,11 @@ extension RangeReplaceableCollection where Self: BidirectionalCollection { } @_disfavoredOverload - mutating func trim( + mutating func _trim( _ consumer: Consumer ) where Consumer.Consumed == Self { - trimSuffix(consumer) - trimPrefix(consumer) + _trimSuffix(consumer) + _trimPrefix(consumer) } } @@ -137,49 +137,49 @@ extension RangeReplaceableCollection { } extension BidirectionalCollection { - func trimmingSuffix( + func _trimmingSuffix( while predicate: @escaping (Element) -> Bool ) -> SubSequence { - trimmingSuffix(ManyConsumer(base: PredicateConsumer(predicate: predicate))) + _trimmingSuffix(ManyConsumer(base: PredicateConsumer(predicate: predicate))) } - func trimming( + func _trimming( while predicate: @escaping (Element) -> Bool ) -> SubSequence { - trimming(ManyConsumer(base: PredicateConsumer(predicate: predicate))) + _trimming(ManyConsumer(base: PredicateConsumer(predicate: predicate))) } } extension BidirectionalCollection where SubSequence == Self { - mutating 
func trimSuffix( + mutating func _trimSuffix( while predicate: @escaping (Element) -> Bool ) { - trimSuffix(ManyConsumer( + _trimSuffix(ManyConsumer( base: PredicateConsumer(predicate: predicate))) } - mutating func trim(while predicate: @escaping (Element) -> Bool) { + mutating func _trim(while predicate: @escaping (Element) -> Bool) { let consumer = ManyConsumer( base: PredicateConsumer(predicate: predicate)) - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } extension RangeReplaceableCollection where Self: BidirectionalCollection { @_disfavoredOverload - mutating func trimSuffix( + mutating func _trimSuffix( while predicate: @escaping (Element) -> Bool ) { - trimSuffix(ManyConsumer(base: PredicateConsumer(predicate: predicate))) + _trimSuffix(ManyConsumer(base: PredicateConsumer(predicate: predicate))) } @_disfavoredOverload - mutating func trim(while predicate: @escaping (Element) -> Bool) { + mutating func _trim(while predicate: @escaping (Element) -> Bool) { let consumer = ManyConsumer( base: PredicateConsumer(predicate: predicate)) - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } @@ -197,7 +197,7 @@ extension Collection where Element: Equatable { public func trimmingPrefix( _ prefix: Prefix ) -> SubSequence where Prefix.Element == Element { - trimmingPrefix(FixedPatternConsumer(pattern: prefix)) + _trimmingPrefix(FixedPatternConsumer(pattern: prefix)) } } @@ -211,12 +211,11 @@ extension Collection where SubSequence == Self, Element: Equatable { public mutating func trimPrefix( _ prefix: Prefix ) where Prefix.Element == Element { - trimPrefix(FixedPatternConsumer(pattern: prefix)) + _trimPrefix(FixedPatternConsumer(pattern: prefix)) } } extension RangeReplaceableCollection where Element: Equatable { - @_disfavoredOverload /// Removes the initial elements that satisfy the given predicate from the /// start of the sequence. 
/// - Parameter predicate: A closure that takes an element of the sequence @@ -226,39 +225,39 @@ extension RangeReplaceableCollection where Element: Equatable { public mutating func trimPrefix( _ prefix: Prefix ) where Prefix.Element == Element { - trimPrefix(FixedPatternConsumer(pattern: prefix)) + _trimPrefix(FixedPatternConsumer(pattern: prefix)) } } extension BidirectionalCollection where Element: Equatable { - func trimmingSuffix( + func _trimmingSuffix( _ suffix: Suffix ) -> SubSequence where Suffix.Element == Element { - trimmingSuffix(FixedPatternConsumer(pattern: suffix)) + _trimmingSuffix(FixedPatternConsumer(pattern: suffix)) } - func trimming( + func _trimming( _ pattern: Pattern ) -> SubSequence where Pattern.Element == Element { - trimming(FixedPatternConsumer(pattern: pattern)) + _trimming(FixedPatternConsumer(pattern: pattern)) } } extension BidirectionalCollection where SubSequence == Self, Element: Equatable { - mutating func trimSuffix( + mutating func _trimSuffix( _ suffix: Suffix ) where Suffix.Element == Element { - trimSuffix(FixedPatternConsumer(pattern: suffix)) + _trimSuffix(FixedPatternConsumer(pattern: suffix)) } - mutating func trim( + mutating func _trim( _ pattern: Pattern ) where Pattern.Element == Element { let consumer = FixedPatternConsumer(pattern: pattern) - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } @@ -266,19 +265,19 @@ extension RangeReplaceableCollection where Self: BidirectionalCollection, Element: Equatable { @_disfavoredOverload - mutating func trimSuffix( + mutating func _trimSuffix( _ prefix: Suffix ) where Suffix.Element == Element { - trimSuffix(FixedPatternConsumer(pattern: prefix)) + _trimSuffix(FixedPatternConsumer(pattern: prefix)) } @_disfavoredOverload - mutating func trim( + mutating func _trim( _ pattern: Pattern ) where Pattern.Element == Element { let consumer = FixedPatternConsumer(pattern: pattern) - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } @@ -290,19 +289,20 @@ extension BidirectionalCollection where SubSequence == Substring { /// - Parameter prefix: The collection to remove from this collection. /// - Returns: A collection containing the elements that do not match /// `prefix` from the start. + @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func trimmingPrefix(_ regex: R) -> SubSequence { - trimmingPrefix(RegexConsumer(regex)) + _trimmingPrefix(RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - func trimmingSuffix(_ regex: R) -> SubSequence { - trimmingSuffix(RegexConsumer(regex)) + func _trimmingSuffix(_ regex: R) -> SubSequence { + _trimmingSuffix(RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - func trimming(_ regex: R) -> SubSequence { - trimming(RegexConsumer(regex)) + func _trimming(_ regex: R) -> SubSequence { + _trimming(RegexConsumer(regex)) } } @@ -311,39 +311,40 @@ extension RangeReplaceableCollection { /// Removes the initial elements that match the given regex. /// - Parameter regex: The regex to remove from this collection.
+ @_disfavoredOverload @available(SwiftStdlib 5.7, *) public mutating func trimPrefix(_ regex: R) { - trimPrefix(RegexConsumer(regex)) + _trimPrefix(RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - mutating func trimSuffix(_ regex: R) { - trimSuffix(RegexConsumer(regex)) + mutating func _trimSuffix(_ regex: R) { + _trimSuffix(RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - mutating func trim(_ regex: R) { + mutating func _trim(_ regex: R) { let consumer = RegexConsumer(regex) - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } extension Substring { @available(SwiftStdlib 5.7, *) - mutating func trimPrefix(_ regex: R) { - trimPrefix(RegexConsumer(regex)) + mutating func _trimPrefix(_ regex: R) { + _trimPrefix(RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - mutating func trimSuffix(_ regex: R) { - trimSuffix(RegexConsumer(regex)) + mutating func _trimSuffix(_ regex: R) { + _trimSuffix(RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - mutating func trim(_ regex: R) { + mutating func _trim(_ regex: R) { let consumer = RegexConsumer(regex) - trimPrefix(consumer) - trimSuffix(consumer) + _trimPrefix(consumer) + _trimSuffix(consumer) } } diff --git a/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift b/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift index 4342391af..2b6dd1704 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift @@ -12,7 +12,7 @@ // MARK: `CollectionSearcher` algorithms extension Collection { - func firstMatch( + func _firstMatch( of searcher: S ) -> _MatchResult? where S.Searched == Self { var state = searcher.state(for: self, in: startIndex..( of regex: R ) -> _MatchResult>? 
{ - firstMatch(of: RegexConsumer(regex)) + _firstMatch(of: RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift index 206d68554..38224e30f 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift @@ -12,7 +12,7 @@ // MARK: `MatchingCollectionSearcher` algorithms extension RangeReplaceableCollection { - func replacing< + func _replacing< Searcher: MatchingCollectionSearcher, Replacement: Collection >( _ searcher: Searcher, @@ -28,7 +28,7 @@ extension RangeReplaceableCollection { var result = Self() result.append(contentsOf: self[..( _ searcher: Searcher, @@ -49,14 +49,14 @@ extension RangeReplaceableCollection { ) rethrows -> Self where Searcher.Searched == SubSequence, Replacement.Element == Element { - try replacing( + try _replacing( searcher, with: replacement, subrange: startIndex..( _ searcher: Searcher, @@ -65,7 +65,7 @@ extension RangeReplaceableCollection { ) rethrows where Searcher.Searched == SubSequence, Replacement.Element == Element { - self = try replacing( + self = try _replacing( searcher, with: replacement, maxReplacements: maxReplacements) @@ -76,13 +76,13 @@ extension RangeReplaceableCollection { extension RangeReplaceableCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) - func replacing( + func _replacing( _ regex: R, with replacement: (_MatchResult>) throws -> Replacement, subrange: Range, maxReplacements: Int = .max ) rethrows -> Self where Replacement.Element == Element { - try replacing( + try _replacing( RegexConsumer(regex), with: replacement, subrange: subrange, @@ -90,12 +90,12 @@ extension RangeReplaceableCollection where SubSequence == Substring { } @available(SwiftStdlib 5.7, *) - func replacing( + func _replacing( _ regex: R, with replacement: (_MatchResult>) throws -> Replacement, maxReplacements: Int = .max ) rethrows -> Self where Replacement.Element == Element { - try replacing( + try _replacing( regex, with: replacement, subrange: startIndex..( + mutating func _replace( _ regex: R, with replacement: (_MatchResult>) throws -> Replacement, maxReplacements: Int = .max ) rethrows where Replacement.Element == Element { - self = try replacing( + self = try _replacing( regex, with: replacement, maxReplacements: maxReplacements) diff --git a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift index f038616fe..293520735 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift @@ -166,7 +166,7 @@ extension ReversedMatchesCollection: Sequence { // MARK: `CollectionSearcher` algorithms extension Collection { - func matches( + func _matches( of searcher: S ) -> MatchesCollection where S.Searched == Self { MatchesCollection(base: self, searcher: searcher) @@ -174,7 +174,7 @@ extension Collection { } extension BidirectionalCollection { - func matchesFromBack( + func _matchesFromBack( of searcher: S ) -> ReversedMatchesCollection where S.BackwardSearched == Self { ReversedMatchesCollection(base: self, searcher: searcher) @@ -186,17 +186,17 @@ extension BidirectionalCollection { extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) @_disfavoredOverload - func matches( + func _matches( of regex: R ) -> 
MatchesCollection> { - matches(of: RegexConsumer(regex)) + _matches(of: RegexConsumer(regex)) } @available(SwiftStdlib 5.7, *) - func matchesFromBack( + func _matchesFromBack( of regex: R ) -> ReversedMatchesCollection> { - matchesFromBack(of: RegexConsumer(regex)) + _matchesFromBack(of: RegexConsumer(regex)) } // FIXME: Return `some Collection.Match> for SE-0346 @@ -213,14 +213,22 @@ extension BidirectionalCollection where SubSequence == Substring { let regex = r.regex var result = [Regex.Match]() - while start < end { + while start <= end { guard let match = try? regex._firstMatch( slice.base, in: start.. let (criticalIndex, periodOfSecondPart) = pattern._criticalFactorization(<) let periodIsExact = pattern[criticalIndex...] .prefix(periodOfSecondPart) - .ends(with: pattern[.. Program { builder.buildAccept() @@ -35,6 +40,9 @@ extension Compiler.ByteCodeGen { builder.buildUnresolvedReference(id: id) case let .changeMatchingOptions(optionSequence): + if !builder.hasReceivedInstructions { + builder.initialOptions.apply(optionSequence.ast) + } options.apply(optionSequence.ast) case let .unconverted(astAtom): @@ -59,7 +67,9 @@ extension Compiler.ByteCodeGen { case .absolute(let i): // Backreferences number starting at 1 builder.buildBackreference(.init(i-1)) - case .relative, .named: + case .named(let name): + try builder.buildNamedReference(name) + case .relative: throw Unsupported("Backreference kind: \(ref)") } } @@ -188,7 +198,6 @@ extension Compiler.ByteCodeGen { mutating func emitCharacter(_ c: Character) throws { // Unicode scalar matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { - print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars") for scalar in c.unicodeScalars { try emitScalar(scalar) } @@ -379,6 +388,9 @@ extension Compiler.ByteCodeGen { throw Unreachable("These should produce a capture node") case .changeMatchingOptions(let optionSequence): + if !builder.hasReceivedInstructions { + builder.initialOptions.apply(optionSequence) + } options.apply(optionSequence) try emitNode(child) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index 51428acee..fe00bdc0f 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -11,27 +11,6 @@ @_implementationOnly import _RegexParser -/// A structured capture -struct StructuredCapture { - /// The `.optional` height of the result - var optionalCount = 0 - - var storedCapture: StoredCapture? - - var someCount: Int { - storedCapture == nil ? optionalCount - 1 : optionalCount - } -} - -/// A storage form for a successful capture -struct StoredCapture { - // TODO: drop optional when engine tracks all ranges - var range: Range? - - // If strongly typed, value is set - var value: Any? = nil -} - // TODO: Where should this live? Inside TypeConstruction? func constructExistentialOutputComponent( from input: Substring, @@ -61,26 +40,30 @@ func constructExistentialOutputComponent( return underlying } -extension StructuredCapture { +@available(SwiftStdlib 5.7, *) +extension AnyRegexOutput.Element { func existentialOutputComponent( from input: Substring ) -> Any { constructExistentialOutputComponent( from: input, - in: storedCapture?.range, - value: storedCapture?.value, - optionalCount: optionalCount) + in: range, + value: value, + optionalCount: optionalDepth + ) } func slice(from input: String) -> Substring? 
{ - guard let r = storedCapture?.range else { return nil } + guard let r = range else { return nil } return input[r] } } -extension Sequence where Element == StructuredCapture { +@available(SwiftStdlib 5.7, *) +extension Sequence where Element == AnyRegexOutput.Element { // FIXME: This is a stop gap where we still slice the input // and traffic through existentials + @available(SwiftStdlib 5.7, *) func existentialOutput( from input: Substring ) -> Any { diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 1c20761c8..4d97c5758 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -27,8 +27,9 @@ class Compiler { __consuming func emit() throws -> Program { // TODO: Handle global options - var codegen = ByteCodeGen(options: options) - codegen.builder.captureList = tree.root._captureList + var codegen = ByteCodeGen( + options: options, captureList: tree.root._captureList + ) try codegen.emitNode(tree.root) let program = try codegen.finish() return program @@ -44,11 +45,11 @@ func _compileRegex( } // An error produced when compiling a regular expression. -public enum RegexCompilationError: Error, CustomStringConvertible { +enum RegexCompilationError: Error, CustomStringConvertible { // TODO: Source location? case uncapturedReference - public var description: String { + var description: String { switch self { case .uncapturedReference: return "Found a reference used before it captured any match." diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 48f353e52..640fe3c93 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -145,10 +145,7 @@ extension String { } func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { - let consume = opts.semanticLevel == .graphemeCluster - ? consumeCharacterWithSingleScalar - : consumeScalar - + let consume = consumeFunction(for: opts) return consume(propertyScalarPredicate { // FIXME: name aliases not covered by $0.nameAlias are missed // e.g. 
U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases @@ -326,7 +323,7 @@ extension DSLTree.CustomCharacterClass.Member { case .quotedLiteral(let s): if opts.isCaseInsensitive { return { input, bounds in - guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else { + guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { return nil } return input.index(after: bounds.lowerBound) @@ -491,14 +488,41 @@ extension AST.Atom.CharacterProperty { case .named(let n): return consumeName(n, opts: opts) + case .age(let major, let minor): + return consume { + guard let age = $0.properties.age else { return false } + return age <= (major, minor) + } + + case .numericValue(let value): + return consume { $0.properties.numericValue == value } + + case .numericType(let type): + return consume { $0.properties.numericType == type } + + case .ccc(let ccc): + return consume { $0.properties.canonicalCombiningClass == ccc } + + case .mapping(.lowercase, let value): + return consume { $0.properties.lowercaseMapping == value } + + case .mapping(.uppercase, let value): + return consume { $0.properties.uppercaseMapping == value } + + case .mapping(.titlecase, let value): + return consume { $0.properties.titlecaseMapping == value } + + case .block(let b): + throw Unsupported("TODO: map block: \(b)") + case .posix(let p): return p.generateConsumer(opts) case .pcreSpecial(let s): throw Unsupported("TODO: map PCRE special: \(s)") - case .onigurumaSpecial(let s): - throw Unsupported("TODO: map Oniguruma special: \(s)") + case .javaSpecial(let s): + throw Unsupported("TODO: map Java special: \(s)") } }() @@ -525,7 +549,7 @@ extension Unicode.BinaryProperty { case .alphabetic: return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: - break + return consume(propertyScalarPredicate(\.isBidiControl)) case .bidiMirrored: return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index cae8194bd..f706c0471 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -39,11 +39,12 @@ extension MEProgram where Input.Element: Hashable { var failAddressToken: AddressToken? = nil var captureList = CaptureList() + var initialOptions = MatchingOptions() // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] var referencedCaptureOffsets: [ReferenceID: Int] = [:] - var namedCaptureOffsets: [String: Int] = [:] + var captureCount: Int { // We currently deduce the capture count from the capture register number. nextCaptureRegister.rawValue @@ -77,6 +78,11 @@ extension MEProgram.Builder { var lastInstructionAddress: InstructionAddress { .init(instructions.endIndex - 1) } + + /// `true` if the builder has received any instructions. + var hasReceivedInstructions: Bool { + !instructions.isEmpty + } mutating func buildNop(_ r: StringRegister? 
= nil) { instructions.append(.init(.nop, .init(optionalString: r))) @@ -278,6 +284,13 @@ extension MEProgram.Builder { unresolvedReferences[id, default: []].append(lastInstructionAddress) } + mutating func buildNamedReference(_ name: String) throws { + guard let index = captureList.indexOfCapture(named: name) else { + throw RegexCompilationError.uncapturedReference + } + buildBackreference(.init(index)) + } + // TODO: Mutating because of fail address fixup, drop when // that's removed mutating func assemble() throws -> MEProgram { @@ -353,7 +366,7 @@ extension MEProgram.Builder { registerInfo: regInfo, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, - namedCaptureOffsets: namedCaptureOffsets) + initialOptions: initialOptions) } mutating func reset() { self = Self() } @@ -449,9 +462,10 @@ extension MEProgram.Builder { assert(preexistingValue == nil) } if let name = name { - // TODO: Reject duplicate capture names unless `(?J)`? - namedCaptureOffsets.updateValue(captureCount, forKey: name) + let index = captureList.indexOfCapture(named: name) + assert(index == nextCaptureRegister.rawValue) } + assert(nextCaptureRegister.rawValue < captureList.captures.count) return nextCaptureRegister } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index e3a542c1e..7003c0261 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -145,7 +145,6 @@ extension Processor._StoredCapture: CustomStringConvertible { struct MECaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] - var namedCaptureOffsets: [String: Int] // func extract(from s: String) -> Array> { // caps.map { $0.map { s[$0] } } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 8f1c721b0..8b4737e7a 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -36,7 +36,8 @@ struct MEProgram where Input.Element: Equatable { let captureList: CaptureList let referencedCaptureOffsets: [ReferenceID: Int] - let namedCaptureOffsets: [String: Int] + + var initialOptions: MatchingOptions } extension MEProgram: CustomStringConvertible { diff --git a/Sources/_StringProcessing/Engine/Structuralize.swift b/Sources/_StringProcessing/Engine/Structuralize.swift index a8cfeb20c..129ac1677 100644 --- a/Sources/_StringProcessing/Engine/Structuralize.swift +++ b/Sources/_StringProcessing/Engine/Structuralize.swift @@ -1,20 +1,27 @@ @_implementationOnly import _RegexParser extension CaptureList { - func structuralize( + @available(SwiftStdlib 5.7, *) + func createElements( _ list: MECaptureList, _ input: String - ) -> [StructuredCapture] { + ) -> [AnyRegexOutput.ElementRepresentation] { assert(list.values.count == captures.count) - - var result = [StructuredCapture]() - for (cap, meStored) in zip(self.captures, list.values) { - let stored = StoredCapture( - range: meStored.latest, value: meStored.latestValue) - - result.append(.init( - optionalCount: cap.optionalDepth, storedCapture: stored)) + + var result = [AnyRegexOutput.ElementRepresentation]() + + for (i, (cap, meStored)) in zip(captures, list.values).enumerated() { + let element = AnyRegexOutput.ElementRepresentation( + optionalDepth: cap.optionalDepth, + bounds: meStored.latest, + name: cap.name, + referenceID: list.referencedCaptureOffsets.first { $1 == i }?.key, + value: 
meStored.latestValue + ) + + result.append(element) } + return result } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e44b110e5..4f428cf06 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -37,11 +37,10 @@ struct Executor { let capList = MECaptureList( values: cpu.storedCaptures, - referencedCaptureOffsets: engine.program.referencedCaptureOffsets, - namedCaptureOffsets: engine.program.namedCaptureOffsets) + referencedCaptureOffsets: engine.program.referencedCaptureOffsets) let range = inputRange.lowerBound.. Bool { + stack.last == other.stack.last + } } // MARK: Matching behavior API @@ -127,6 +134,7 @@ extension MatchingOptions { } } +// MARK: - Implementation extension MatchingOptions { /// An option that changes the behavior of a regular expression. fileprivate enum Option: Int { @@ -205,9 +213,6 @@ extension MatchingOptions { // Whitespace options are only relevant during parsing, not compilation. case .extended, .extraExtended: return nil - @unknown default: - // Ignore unknown - return nil } } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 601447968..3058a3956 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -17,20 +17,26 @@ // incremental conversion, such that leaves remain // as canonical regex literals. +/// Renders an AST tree as a Pattern DSL. +/// +/// - Parameters: +/// - ast: A `_RegexParser.AST` instance. +/// - maxTopDownLevels: The number of levels down from the root of the tree +/// to perform conversion. `nil` means no limit. +/// - minBottomUpLevels: The number of levels up from the leaves of the tree +/// to perform conversion. `nil` means no limit. +/// - Returns: A string representation of `ast` in the `RegexBuilder` syntax. @_spi(PatternConverter) -extension AST { - /// Renders as a Pattern DSL. - @_spi(PatternConverter) - public func renderAsBuilderDSL( - maxTopDownLevels: Int? = nil, - minBottomUpLevels: Int? = nil - ) -> String { - var printer = PrettyPrinter( - maxTopDownLevels: maxTopDownLevels, - minBottomUpLevels: minBottomUpLevels) - printer.printAsPattern(self) - return printer.finish() - } +public func renderAsBuilderDSL( + ast: Any, + maxTopDownLevels: Int? = nil, + minBottomUpLevels: Int? = nil +) -> String { + var printer = PrettyPrinter( + maxTopDownLevels: maxTopDownLevels, + minBottomUpLevels: minBottomUpLevels) + printer.printAsPattern(ast as! AST) + return printer.finish() } extension PrettyPrinter { @@ -137,6 +143,51 @@ extension PrettyPrinter { blockName = "\(amount)" } + // Special case single child character classes for repetition nodes. 
+ // This lets us do something like the following: + // + // OneOrMore(.digit) + // vs + // OneOrMore { + // One(.digit) + // } + // + func printSimpleCCC( + _ ccc: DSLTree.CustomCharacterClass + ) { + indent() + + if kind != ".eager" { + blockName.removeLast() + output("\(blockName), ") + } else { + output("\(blockName)(") + } + + printAsPattern(ccc, wrap: false, terminateLine: false) + output(")") + terminateLine() + } + + switch child { + case let .customCharacterClass(ccc): + if ccc.isSimplePrint { + printSimpleCCC(ccc) + return + } + + break + case let .convertedRegexLiteral(.customCharacterClass(ccc), _): + if ccc.isSimplePrint { + printSimpleCCC(ccc) + return + } + + break + default: + break + } + printBlock(blockName) { printer in printer.printAsPattern(convertedFromAST: child) } @@ -187,10 +238,10 @@ extension PrettyPrinter { print("/* TODO: absent function */") } } - - // TODO: Some way to integrate this with conversion... + mutating func printAsPattern( _ ccc: DSLTree.CustomCharacterClass, + wrap: Bool = true, terminateLine: Bool = true ) { if ccc.hasUnprintableProperty { @@ -202,11 +253,10 @@ extension PrettyPrinter { if ccc.isInverted { printIndented { printer in printer.indent() + printer.output(".inverted") if terminateLine { - printer.print(".inverted") - } else { - printer.output(".inverted") + printer.terminateLine() } } } @@ -215,7 +265,7 @@ extension PrettyPrinter { // If we only have 1 member, then we can emit it without the extra // CharacterClass initialization if ccc.members.count == 1 { - printAsPattern(ccc.members[0]) + printAsPattern(ccc.members[0], wrap: wrap) if terminateLine { self.terminateLine() @@ -279,12 +329,20 @@ extension PrettyPrinter { // Also in the same vein, if we have a few atom members but no // nonAtomMembers, then we can emit a single .anyOf(...) for them. if !charMembers.isEmpty, nonCharMembers.isEmpty { - if terminateLine { - print(".anyOf(\(charMembers._quoted))") + let anyOf = ".anyOf(\(charMembers._quoted))" + + indent() + + if wrap { + output("One(\(anyOf))") } else { - indent() - output(".anyOf(\(charMembers._quoted))") + output(anyOf) } + + if terminateLine { + self.terminateLine() + } + return } @@ -304,7 +362,7 @@ extension PrettyPrinter { } for (i, member) in nonCharMembers.enumerated() { - printer.printAsPattern(member) + printer.printAsPattern(member, wrap: false) if i != nonCharMembers.count - 1 { printer.output(",") @@ -314,17 +372,18 @@ extension PrettyPrinter { } } + indent() + output(")") + if terminateLine { - print(")") - } else { - indent() - output(")") + self.terminateLine() } } // TODO: Some way to integrate this with conversion... 
mutating func printAsPattern( - _ member: DSLTree.CustomCharacterClass.Member + _ member: DSLTree.CustomCharacterClass.Member, + wrap: Bool = true ) { switch member { case let .custom(ccc): @@ -344,48 +403,98 @@ extension PrettyPrinter { indent() switch a { case let .char(c): - output(".anyOf(\(String(c)._quoted))") + + if wrap { + output("One(.anyOf(\(String(c)._quoted)))") + } else { + output(".anyOf(\(String(c)._quoted))") + } + case let .scalar(s): - output(".anyOf(\"\\u{\(String(s.value, radix: 16))}\")") + + if wrap { + output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))") + } else { + output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")") + } + case let .unconverted(a): - output(a.ast._patternBase) + let base = a.ast._patternBase + + if base.canBeWrapped, wrap { + output("One(\(base.0))") + } else { + output(base.0) + } default: print(" // TODO: Atom \(a)") } case .quotedLiteral(let s): - output(".anyOf(\(s._quoted))") + + if wrap { + output("One(.anyOf(\(s._quoted)))") + } else { + output(".anyOf(\(s._quoted))") + } case .trivia(_): // We never print trivia break case .intersection(let first, let second): - printAsPattern(first) + if wrap, first.isSimplePrint { + indent() + output("One(") + } + + printAsPattern(first, wrap: false) printIndented { printer in printer.indent() printer.output(".intersection(") - printer.printAsPattern(second, terminateLine: false) + printer.printAsPattern(second, wrap: false, terminateLine: false) printer.output(")") } + if wrap, first.isSimplePrint { + output(")") + } + case .subtraction(let first, let second): - printAsPattern(first) + if wrap, first.isSimplePrint { + indent() + output("One(") + } + + printAsPattern(first, wrap: false) printIndented { printer in printer.indent() printer.output(".subtracting(") - printer.printAsPattern(second, terminateLine: false) + printer.printAsPattern(second, wrap: false, terminateLine: false) printer.output(")") } + if wrap, first.isSimplePrint { + output(")") + } + case .symmetricDifference(let first, let second): - printAsPattern(first) + if wrap, first.isSimplePrint { + indent() + output("One(") + } + + printAsPattern(first, wrap: false) printIndented { printer in printer.indent() printer.output(".symmetricDifference(") - printer.printAsPattern(second, terminateLine: false) + printer.printAsPattern(second, wrap: false, terminateLine: false) printer.output(")") } + + if wrap, first.isSimplePrint { + output(")") + } } } @@ -469,7 +578,7 @@ extension PrettyPrinter { extension String { // TODO: Escaping? fileprivate var _quoted: String { - "\"\(self.replacing("\"", with: "\\\""))\"" + "\"\(self._replacing("\"", with: "\\\""))\"" } } @@ -658,118 +767,118 @@ extension AST.Atom { /// caller, but we might want to be parameterized at that point. /// /// TODO: Some way to integrate this with conversion... 
- var _patternBase: String { + var _patternBase: (String, canBeWrapped: Bool) { if let anchor = self.assertionKind { - return anchor._patternBase + return (anchor._patternBase, false) } if isUnprintableAtom { - return _regexBase + return (_regexBase, false) } return _dslBase } - var _dslBase: String { + var _dslBase: (String, canBeWrapped: Bool) { func scalarLiteral(_ s: UnicodeScalar) -> String { let hex = String(s.value, radix: 16, uppercase: true) return "\\u{\(hex)}" } switch kind { case let .char(c): - return String(c) + return (String(c), false) case let .scalar(s): - return scalarLiteral(s.value) + return (scalarLiteral(s.value), false) case let .scalarSequence(seq): - return seq.scalarValues.map(scalarLiteral).joined() + return (seq.scalarValues.map(scalarLiteral).joined(), false) case let .property(p): - return p._dslBase + return (p._dslBase, true) case let .escaped(e): switch e { // Anchors case .wordBoundary: - return "Anchor.wordBoundary" + return ("Anchor.wordBoundary", false) case .notWordBoundary: - return "Anchor.wordBoundary.inverted" + return ("Anchor.wordBoundary.inverted", false) case .startOfSubject: - return "Anchor.startOfSubject" + return ("Anchor.startOfSubject", false) case .endOfSubject: - return "Anchor.endOfSubject" + return ("Anchor.endOfSubject", false) case .endOfSubjectBeforeNewline: - return "Anchor.endOfSubjectBeforeNewline" + return ("Anchor.endOfSubjectBeforeNewline", false) case .firstMatchingPositionInSubject: - return "Anchor.firstMatchingPositionInSubject" + return ("Anchor.firstMatchingPositionInSubject", false) case .textSegment: - return "Anchor.textSegmentBoundary" + return ("Anchor.textSegmentBoundary", false) case .notTextSegment: - return "Anchor.textSegmentBoundary.inverted" + return ("Anchor.textSegmentBoundary.inverted", false) // Character Classes case .decimalDigit: - return ".digit" + return (".digit", true) case .notDecimalDigit: - return ".digit.inverted" + return (".digit.inverted", true) case .horizontalWhitespace: - return ".horizontalWhitespace" + return (".horizontalWhitespace", true) case .notHorizontalWhitespace: - return ".horizontalWhitespace.inverted" + return (".horizontalWhitespace.inverted", true) case .whitespace: - return ".whitespace" + return (".whitespace", true) case .notWhitespace: - return ".whitespace.inverted" + return (".whitespace.inverted", true) case .wordCharacter: - return ".word" + return (".word", true) case .notWordCharacter: - return ".word.inverted" + return (".word.inverted", true) case .graphemeCluster: - return ".anyGraphemeCluster" + return (".anyGraphemeCluster", true) case .newlineSequence: - return ".newlineSequence" + return (".newlineSequence", true) case .notNewline: - return ".newlineSequence.inverted" + return (".newlineSequence.inverted", true) case .verticalTab: - return ".verticalWhitespace" + return (".verticalWhitespace", true) case .notVerticalTab: - return ".verticalWhitespace.inverted" + return (".verticalWhitespace.inverted", true) // Literal single characters all get converted into DSLTree.Atom.scalar default: - return "TODO: escaped \(e)" + return ("TODO: escaped \(e)", false) } case .namedCharacter: - return " /* TODO: named character */" + return (" /* TODO: named character */", false) case .any: - return ".any" + return (".any", true) case .startOfLine, .endOfLine: fatalError("unreachable") case .backreference: - return " /* TODO: back reference */" + return (" /* TODO: back reference */", false) case .subpattern: - return " /* TODO: subpattern */" + return (" /* TODO: 
subpattern */", false) case .callout: - return " /* TODO: callout */" + return (" /* TODO: callout */", false) case .backtrackingDirective: - return " /* TODO: backtracking directive */" + return (" /* TODO: backtracking directive */", false) case .changeMatchingOptions: - return "/* TODO: change matching options */" + return ("/* TODO: change matching options */", false) // Every other case we've already decided cannot be represented inside the // DSL. default: - return "" + return ("", false) } } @@ -877,6 +986,52 @@ extension DSLTree.CustomCharacterClass { $0.isUnprintableMember } } + + var isSimplePrint: Bool { + if members.count == 1 { + switch members[0] { + case .intersection(_, _): + return false + case .subtraction(_, _): + return false + case .symmetricDifference(_, _): + return false + default: + return true + } + } + + let nonCharMembers = members.filter { + switch $0 { + case let .atom(a): + switch a { + case .char(_): + return false + case .scalar(_): + return false + case .unconverted(_): + return true + default: + return true + } + + case .quotedLiteral(_): + return false + + case .trivia(_): + return false + + default: + return true + } + } + + if nonCharMembers.isEmpty { + return true + } + + return false + } } extension DSLTree.Atom { @@ -896,7 +1051,7 @@ extension DSLTree.Atom { if a.ast.isUnprintableAtom { return "#/\(a.ast._regexBase)/#" } else { - return a.ast._dslBase + return a.ast._dslBase.0 } case .assertion(let a): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 79a515033..320d10897 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,15 +13,7 @@ extension AST { var dslTree: DSLTree { - return DSLTree( - root.dslTreeNode, options: globalOptions?.dslTreeOptions) - } -} - -extension AST.GlobalMatchingOptionSequence { - var dslTreeOptions: DSLTree.Options { - // TODO: map options - return .init() + return DSLTree(root.dslTreeNode) } } @@ -137,6 +129,9 @@ extension AST.Node { case let .trivia(v): return .trivia(v.contents) + case .interpolation: + throw Unsupported("TODO: interpolation") + case let .atom(v): switch v.kind { case .scalarSequence(let seq): diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 6dd8e17b6..1b5ce346f 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -41,11 +41,13 @@ extension Regex.Match where Output == AnyRegexOutput { public subscript( dynamicMember keyPath: KeyPath<(Substring, _doNotUse: ()), Substring> ) -> Substring { - input[range] + anyRegexOutput.input[range] } public subscript(name: String) -> AnyRegexOutput.Element? { - namedCaptureOffsets[name].map { self[$0 + 1] } + anyRegexOutput.first { + $0.name == name + } } } @@ -53,18 +55,26 @@ extension Regex.Match where Output == AnyRegexOutput { @available(SwiftStdlib 5.7, *) public struct AnyRegexOutput { let input: String - let namedCaptureOffsets: [String: Int] - fileprivate let _elements: [ElementRepresentation] + let _elements: [ElementRepresentation] /// The underlying representation of the element of a type-erased regex /// output. - fileprivate struct ElementRepresentation { + internal struct ElementRepresentation { /// The depth of `Optioals`s wrapping the underlying value. For example, /// `Substring` has optional depth `0`, and `Int??` has optional depth `2`. 
let optionalDepth: Int /// The bounds of the output element. let bounds: Range? + + /// The name of the capture. + var name: String? = nil + + /// The capture reference this element refers to. + var referenceID: ReferenceID? = nil + + /// If the output value is strongly typed, then this will be set. + var value: Any? = nil } } @@ -75,14 +85,7 @@ extension AnyRegexOutput { /// Use this initializer to fit a regex with strongly typed captures into the /// use site of a dynamic regex, like one that was created from a string. public init(_ match: Regex.Match) { - // Note: We use type equality instead of `match.output as? ...` to prevent - // unexpected optional flattening. - if Output.self == AnyRegexOutput.self { - self = match.output as! AnyRegexOutput - return - } - fatalError("FIXME: Not implemented") - // self.init(input: match.input, _elements: ) + self = match.anyRegexOutput } /// Returns a typed output by converting the underlying value to the specified @@ -92,11 +95,8 @@ extension AnyRegexOutput { /// - Returns: The output, if the underlying value can be converted to the /// output type; otherwise `nil`. public func `as`(_ type: Output.Type = Output.self) -> Output? { - let elements = _elements.map { - StructuredCapture( - optionalCount: $0.optionalDepth, - storedCapture: .init(range: $0.bounds) - ).existentialOutputComponent(from: input[...]) + let elements = map { + $0.existentialOutputComponent(from: input[...]) } return TypeConstruction.tuple(of: elements) as? Output } @@ -104,24 +104,16 @@ extension AnyRegexOutput { - internal init( - input: String, namedCaptureOffsets: [String: Int], elements: C - ) where C.Element == StructuredCapture { + internal init(input: String, elements: [ElementRepresentation]) { self.init( input: input, - namedCaptureOffsets: namedCaptureOffsets, - _elements: elements.map(ElementRepresentation.init)) + _elements: elements + ) } } @available(SwiftStdlib 5.7, *) extension AnyRegexOutput.ElementRepresentation { - init(_ element: StructuredCapture) { - self.init( - optionalDepth: element.optionalCount, - bounds: element.storedCapture.flatMap(\.range)) - } - func value(forInput input: String) -> Any { // Ok for now because `existentialMatchComponent` // won't slice the input if there's no range to slice with @@ -133,7 +125,8 @@ extension AnyRegexOutput.ElementRepresentation { from: input, in: bounds, value: nil, - optionalCount: optionalDepth) + optionalCount: optionalDepth + ) } } @@ -142,12 +135,24 @@ extension AnyRegexOutput: RandomAccessCollection { public struct Element { fileprivate let representation: ElementRepresentation let input: String - + + var optionalDepth: Int { + representation.optionalDepth + } + + var name: String? { + representation.name + } + /// The range over which a value was captured. `nil` for no-capture. public var range: Range? { representation.bounds } - + + var referenceID: ReferenceID? { + representation.referenceID + } + /// The slice of the input over which a value was captured. `nil` for no-capture. public var substring: Substring? { range.map { input[$0] } @@ -155,7 +160,7 @@ extension AnyRegexOutput: RandomAccessCollection { /// The captured value, `nil` for no-capture public var value: Any? { - fatalError() + representation.value } } @@ -187,7 +192,9 @@ extension AnyRegexOutput: RandomAccessCollection { @available(SwiftStdlib 5.7, *) extension AnyRegexOutput { public subscript(name: String) -> Element?
{ - namedCaptureOffsets[name].map { self[$0 + 1] } + first { + $0.name == name + } } } @@ -198,19 +205,11 @@ extension Regex.Match where Output == AnyRegexOutput { /// Use this initializer to fit a regex match with strongly typed captures into the /// use site of a dynamic regex match, like one that was created from a string. public init(_ match: Regex.Match) { - fatalError("FIXME: Not implemented") - } - - /// Returns a typed match by converting the underlying values to the specified - /// types. - /// - /// - Parameter type: The expected output type. - /// - Returns: A match generic over the output type, if the underlying values - /// can be converted to the output type; otherwise, `nil`. - public func `as`( - _ type: Output.Type = Output.self - ) -> Regex.Match? { - fatalError("FIXME: Not implemented") + self.init( + anyRegexOutput: match.anyRegexOutput, + range: match.range, + value: match.value + ) } } @@ -231,7 +230,7 @@ extension Regex where Output == AnyRegexOutput { /// Use this initializer to fit a regex with strongly typed captures into the /// use site of a dynamic regex, i.e. one that was created from a string. public init(_ regex: Regex) { - fatalError("FIXME: Not implemented") + self.init(node: regex.root) } /// Returns a typed regex by converting the underlying types. @@ -242,6 +241,12 @@ extension Regex where Output == AnyRegexOutput { public func `as`( _ type: Output.Type = Output.self ) -> Regex? { - fatalError("FIXME: Not implemented") + let result = Regex(node: root) + + guard result._verifyType() else { + return nil + } + + return result } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 29d2267b2..5d2101afe 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -91,6 +91,18 @@ extension Regex { self.tree = tree } } + + /// The set of matching options that applies to the start of this regex. + /// + /// Note that the initial options may not apply to the entire regex. For + /// example, in this regex, only case insensitivity (`i`) and Unicode scalar + /// semantics (set by API) apply to the entire regex, while ASCII character + /// classes (`P`) is part of `initialOptions` but not global: + /// + /// let regex = /(?i)(?P:\d+\s*)abc/.semanticLevel(.unicodeScalar) + var initialOptions: MatchingOptions { + program.loweredProgram.initialOptions + } } @available(SwiftStdlib 5.7, *) @@ -102,6 +114,6 @@ extension Regex { @_spi(RegexBuilder) public init(node: DSLTree.Node) { - self.program = Program(tree: .init(node, options: nil)) + self.program = Program(tree: .init(node)) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index ff057f2ee..8ca6dce8d 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -14,11 +14,9 @@ @_spi(RegexBuilder) public struct DSLTree { var root: Node - var options: Options? - init(_ r: Node, options: Options?) 
{ + init(_ r: Node) { self.root = r - self.options = options } } @@ -507,8 +505,6 @@ extension DSLTree.Node { child._addCaptures(to: &list, optionalNesting: nesting) case .clearer, .repeater, .stopper: break - @unknown default: - fatalError() } case let .convertedRegexLiteral(n, _): diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 8172e993b..78c9c8c9f 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -17,17 +17,11 @@ extension Regex { /// providing direct access to captures. @dynamicMemberLookup public struct Match { - let input: String + let anyRegexOutput: AnyRegexOutput /// The range of the overall match. public let range: Range - let rawCaptures: [StructuredCapture] - - let referencedCaptureOffsets: [ReferenceID: Int] - - let namedCaptureOffsets: [String: Int] - let value: Any? } } @@ -37,18 +31,21 @@ extension Regex.Match { /// The output produced from the match operation. public var output: Output { if Output.self == AnyRegexOutput.self { - let wholeMatchAsCapture = StructuredCapture( - optionalCount: 0, - storedCapture: StoredCapture(range: range, value: nil)) + let wholeMatchCapture = AnyRegexOutput.ElementRepresentation( + optionalDepth: 0, + bounds: range + ) + let output = AnyRegexOutput( - input: input, - namedCaptureOffsets: namedCaptureOffsets, - elements: [wholeMatchAsCapture] + rawCaptures) + input: anyRegexOutput.input, + _elements: [wholeMatchCapture] + anyRegexOutput._elements + ) + return output as! Output } else if Output.self == Substring.self { // FIXME: Plumb whole match (`.0`) through the matching engine. - return input[range] as! Output - } else if rawCaptures.isEmpty, value != nil { + return anyRegexOutput.input[range] as! Output + } else if anyRegexOutput.isEmpty, value != nil { // FIXME: This is a workaround for whole-match values not // being modeled as part of captures. We might want to // switch to a model where results are alongside captures @@ -57,7 +54,9 @@ extension Regex.Match { guard value == nil else { fatalError("FIXME: what would this mean?") } - let typeErasedMatch = rawCaptures.existentialOutput(from: input[range]) + let typeErasedMatch = anyRegexOutput.existentialOutput( + from: anyRegexOutput.input[range] + ) return typeErasedMatch as! Output } } @@ -77,12 +76,15 @@ extension Regex.Match { @_spi(RegexBuilder) public subscript(_ id: ReferenceID) -> Capture { - guard let offset = referencedCaptureOffsets[id] else { - preconditionFailure( - "Reference did not capture any match in the regex") + guard let element = anyRegexOutput.first( + where: { $0.referenceID == id } + ) else { + preconditionFailure("Reference did not capture any match in the regex") } - return rawCaptures[offset].existentialOutputComponent(from: input[...]) - as! Capture + + return element.existentialOutputComponent( + from: anyRegexOutput.input[...] + ) as! Capture } } @@ -154,13 +156,17 @@ extension Regex { var low = inputRange.lowerBound let high = inputRange.upperBound - while low < high { + while true { if let m = try _match(input, in: low..= high { return nil } + if regex.initialOptions.semanticLevel == .graphemeCluster { + input.formIndex(after: &low) + } else { + input.unicodeScalars.formIndex(after: &low) + } } - return nil } } @@ -173,7 +179,7 @@ extension BidirectionalCollection where SubSequence == Substring { public func wholeMatch( of r: R ) -> Regex.Match? { - try? r.regex.wholeMatch(in: self[...].base) + try? 
r.regex.wholeMatch(in: self[...]) } /// Checks for a match against the string, starting at its beginning. diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 78477e2b5..49a08430d 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -319,6 +319,14 @@ func charClass( return .custom(cc) } +func setOp( + _ lhs: AST.CustomCharacterClass.Member..., + op: AST.CustomCharacterClass.SetOp, + _ rhs: AST.CustomCharacterClass.Member... +) -> AST.CustomCharacterClass.Member { + .setOperation(lhs, .init(faking: op), rhs) +} + func quote(_ s: String) -> AST.Node { .quote(.init(s, .fake)) } @@ -408,7 +416,7 @@ func prop_m( func range_m( _ lower: AST.Atom, _ upper: AST.Atom ) -> AST.CustomCharacterClass.Member { - .range(.init(lower, .fake, upper)) + .range(.init(lower, .fake, upper, trivia: [])) } func range_m( _ lower: AST.Atom.Kind, _ upper: AST.Atom.Kind diff --git a/Sources/_StringProcessing/Utility/TypeVerification.swift b/Sources/_StringProcessing/Utility/TypeVerification.swift new file mode 100644 index 000000000..df0b59f2c --- /dev/null +++ b/Sources/_StringProcessing/Utility/TypeVerification.swift @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_implementationOnly import _RegexParser + +@available(SwiftStdlib 5.7, *) +extension Regex { + internal func _verifyType() -> Bool { + var tupleElements: [Any.Type] = [Substring.self] + var labels = " " + + for capture in program.tree.root._captureList.captures { + var captureType: Any.Type = capture.type ?? Substring.self + var i = capture.optionalDepth + + while i != 0 { + captureType = TypeConstruction.optionalType(of: captureType) + i -= 1 + } + + tupleElements.append(captureType) + + if let name = capture.name { + labels += name + } + + labels.unicodeScalars.append(" ") + } + + // If we have no captures, then our Regex must be Regex. + if tupleElements.count == 1 { + return Output.self == Substring.self + } + + let createdType = TypeConstruction.tupleType( + of: tupleElements, + + // If all of our labels are spaces, that means no actual label was added + // to the tuple. In that case, don't pass a label string. + labels: labels.all { $0 == " " } ? nil : labels + ) + + return Output.self == createdType + } +} diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index 173d41598..def55bc17 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -13,7 +13,6 @@ import XCTest import _StringProcessing import RegexBuilder -@available(SwiftStdlib 5.7, *) class RegexConsumerTests: XCTestCase { func testMatches() { let regex = Capture(OneOrMore(.digit)) { 2 * Int($0)! 
} @@ -230,6 +229,16 @@ class AlgorithmsResultBuilderTests: XCTestCase { } func testMatches() throws { + do { + let regex = Regex { OneOrMore(.any) } + XCTAssertEqual("abc".wholeMatch(of: regex)!.0, "abc") + XCTAssertEqual("abc".prefixMatch(of: regex)!.0, "abc") + XCTAssertEqual("abc".firstMatch(of: regex)!.0, "abc") + XCTAssertEqual("abc".suffix(1).wholeMatch(of: regex)!.0, "c") + XCTAssertEqual("abc".suffix(1).prefixMatch(of: regex)!.0, "c") + XCTAssertEqual("abc".suffix(1).firstMatch(of: regex)!.0, "c") + } + let int = Capture(OneOrMore(.digit)) { Int($0)! } // Test syntax diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index c0c6491ac..f325b579f 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -66,7 +66,7 @@ class RegexDSLTests: XCTestCase { ("a c", ("a c", " ", "c")), matchType: (Substring, Substring, Substring).self, ==) { - .any + One(.any) Capture(.whitespace) // Substring Capture("c") // Substring } @@ -344,7 +344,7 @@ class RegexDSLTests: XCTestCase { matchType: (Substring, Substring).self, ==) { OneOrMore(.reluctant) { - .word + One(.word) }.repetitionBehavior(.possessive) Capture(.digit) ZeroOrMore(.any) @@ -615,13 +615,13 @@ class RegexDSLTests: XCTestCase { func testUnicodeScalarPostProcessing() throws { let spaces = Regex { ZeroOrMore { - .whitespace + One(.whitespace) } } let unicodeScalar = Regex { OneOrMore { - .hexDigit + One(.hexDigit) } spaces } @@ -637,14 +637,10 @@ class RegexDSLTests: XCTestCase { spaces Capture { - OneOrMore { - .word - } + OneOrMore(.word) } - ZeroOrMore { - .any - } + ZeroOrMore(.any) } // Assert the inferred capture type. @@ -841,7 +837,7 @@ class RegexDSLTests: XCTestCase { let a = Reference(Substring.self) ChoiceOf<(Substring, Substring?)> { Regex { - .word + One(.word) a } Regex { @@ -890,7 +886,7 @@ class RegexDSLTests: XCTestCase { let a = Reference(Substring.self) ChoiceOf<(Substring, Substring?)> { Regex { - .word + One(.word) a } Regex { diff --git a/Tests/RegexTests/AlgorithmsInternalsTests.swift b/Tests/RegexTests/AlgorithmsInternalsTests.swift index f0d556744..31e082bce 100644 --- a/Tests/RegexTests/AlgorithmsInternalsTests.swift +++ b/Tests/RegexTests/AlgorithmsInternalsTests.swift @@ -24,7 +24,7 @@ extension AlgorithmTests { let str = "a string with the letter b in it" let first = str.firstRange(of: r) - let last = str.lastRange(of: r) + let last = str._lastRange(of: r) let (expectFirst, expectLast) = ( str.index(atOffset: 0)..(element: T, count: Int) -> UnfoldSequence } } +struct CountedOptionSet: OptionSet { + static var arrayLiteralCreationCount = 0 + + var rawValue: Int + + static var one = Self(rawValue: 1) + static var two = Self(rawValue: 2) +} + +extension CountedOptionSet { + init(arrayLiteral: Self...) 
{ + Self.arrayLiteralCreationCount += 1 + self.rawValue = 0 + for element in arrayLiteral { + self.insert(element) + } + } +} + class AlgorithmTests: XCTestCase { func testContains() { - XCTAssertTrue("".contains("")) - XCTAssertTrue("abcde".contains("")) + XCTAssertTrue("abcde".contains("a")) + XCTAssertTrue("abcde".contains("e" as Character)) + + XCTExpectFailure { + XCTAssertTrue("".contains("")) + XCTAssertTrue("abcde".contains("")) + } XCTAssertTrue("abcde".contains("abcd")) XCTAssertTrue("abcde".contains("bcde")) XCTAssertTrue("abcde".contains("bcd")) @@ -51,7 +75,36 @@ class AlgorithmTests: XCTestCase { } } - func testRanges() { + func testContainsSourceCompatibility() { + CountedOptionSet.arrayLiteralCreationCount = 0 + + let both: CountedOptionSet = [.one, .two] + let none: CountedOptionSet = [] + XCTAssertEqual(CountedOptionSet.arrayLiteralCreationCount, 2) + + let cosArray = [both, .one, .two] + XCTAssertFalse(cosArray.contains(none)) + + // This tests that `contains([])` uses the element-based `contains(_:)` + // method, interpreting `[]` as an instance of `CountedOptionSet`, rather + // than the collection-based overload, which would interpret `[]` as an + // `Array`. + XCTAssertFalse(cosArray.contains([])) + XCTAssertEqual(CountedOptionSet.arrayLiteralCreationCount, 3) + + // For these references to resolve to the `Element`-based stdlib function, + // the `String`- and `Substring`-based `contains` functions need to be + // marked as `@_disfavoredOverload`. However, that means that Foundation's + // `String.contains` get selected instead, which has inconsistent behavior. + + // Test that original `contains` functions are still accessible + let containsRef = "abcd".contains + XCTAssert(type(of: containsRef) == ((Character) -> Bool).self) + let containsParamsRef = "abcd".contains(_:) + XCTAssert(type(of: containsParamsRef) == ((Character) -> Bool).self) + } + + func testRegexRanges() { func expectRanges( _ string: String, _ regex: String, @@ -67,6 +120,9 @@ class AlgorithmTests: XCTestCase { let actualCol: [Range] = string[...].ranges(of: regex)[...].map(string.offsets(of:)) XCTAssertEqual(actualCol, expected, file: file, line: line) + let matchRanges = string.matches(of: regex).map { string.offsets(of: $0.range) } + XCTAssertEqual(matchRanges, expected, file: file, line: line) + let firstRange = string.firstRange(of: regex).map(string.offsets(of:)) XCTAssertEqual(firstRange, expected.first, file: file, line: line) } @@ -75,6 +131,7 @@ class AlgorithmTests: XCTestCase { expectRanges("", "x", []) expectRanges("", "x+", []) expectRanges("", "x*", [0..<0]) + expectRanges("aaa", "a*", [0..<3, 3..<3]) expectRanges("abc", "", [0..<0, 1..<1, 2..<2, 3..<3]) expectRanges("abc", "x", []) expectRanges("abc", "x+", []) @@ -89,8 +146,10 @@ class AlgorithmTests: XCTestCase { expectRanges("abc", "(a|b)*", [0..<2, 2..<2, 3..<3]) expectRanges("abc", "(b|c)+", [1..<3]) expectRanges("abc", "(b|c)*", [0..<0, 1..<3, 3..<3]) - - func expectStringRanges( + } + + func testStringRanges() { + func expectRanges( _ input: String, _ pattern: String, _ expected: [Range], @@ -107,16 +166,16 @@ class AlgorithmTests: XCTestCase { XCTAssertEqual(firstRange, expected.first, file: file, line: line) } - expectStringRanges("", "", [0..<0]) - expectStringRanges("abcde", "", [0..<0, 1..<1, 2..<2, 3..<3, 4..<4, 5..<5]) - expectStringRanges("abcde", "abcd", [0..<4]) - expectStringRanges("abcde", "bcde", [1..<5]) - expectStringRanges("abcde", "bcd", [1..<4]) - expectStringRanges("ababacabababa", "abababa", [6..<13]) - 
expectStringRanges("ababacabababa", "aba", [0..<3, 6..<9, 10..<13]) + expectRanges("", "", [0..<0]) + expectRanges("abcde", "", [0..<0, 1..<1, 2..<2, 3..<3, 4..<4, 5..<5]) + expectRanges("abcde", "abcd", [0..<4]) + expectRanges("abcde", "bcde", [1..<5]) + expectRanges("abcde", "bcd", [1..<4]) + expectRanges("ababacabababa", "abababa", [6..<13]) + expectRanges("ababacabababa", "aba", [0..<3, 6..<9, 10..<13]) } - func testSplit() { + func testRegexSplit() { func expectSplit( _ string: String, _ regex: String, @@ -135,9 +194,47 @@ class AlgorithmTests: XCTestCase { expectSplit("a", "a", ["", ""]) expectSplit("a____a____a", "_+", ["a", "a", "a"]) expectSplit("____a____a____a____", "_+", ["", "a", "a", "a", ""]) + } + + func testStringSplit() { + func expectSplit( + _ string: String, + _ separator: String, + _ expected: [Substring], + file: StaticString = #file, line: UInt = #line + ) { + let actual = Array(string.split(separator: separator, omittingEmptySubsequences: false)) + XCTAssertEqual(actual, expected, file: file, line: line) + } + + expectSplit("", "", [""]) + expectSplit("", "x", [""]) + expectSplit("a", "", ["", "a", ""]) + expectSplit("a", "x", ["a"]) + expectSplit("a", "a", ["", ""]) + expectSplit("a__a__a", "_", ["a", "", "a", "", "a"]) + expectSplit("_a_a_a_", "_", ["", "a", "a", "a", ""]) XCTAssertEqual("".split(separator: ""), []) XCTAssertEqual("".split(separator: "", omittingEmptySubsequences: false), [""]) + } + + func testSplitSourceCompatibility() { + CountedOptionSet.arrayLiteralCreationCount = 0 + + let both: CountedOptionSet = [.one, .two] + let none: CountedOptionSet = [] + XCTAssertEqual(CountedOptionSet.arrayLiteralCreationCount, 2) + + let cosArray = [both, .one, .two] + XCTAssertEqual(cosArray.split(separator: none).count, 1) + + // This tests that `contains([])` uses the element-based `contains(_:)` + // method, interpreting `[]` as an instance of `CountedOptionSet`, rather + // than the collection-based overload, which would interpret `[]` as an + // `Array`. + XCTAssertEqual(cosArray.split(separator: []).count, 1) + XCTAssertEqual(CountedOptionSet.arrayLiteralCreationCount, 3) // Test that original `split` functions are still accessible let splitRef = "abcd".split @@ -145,7 +242,7 @@ class AlgorithmTests: XCTestCase { let splitParamsRef = "abcd".split(separator:maxSplits:omittingEmptySubsequences:) XCTAssert(type(of: splitParamsRef) == ((Character, Int, Bool) -> [Substring]).self) } - + func testSplitPermutations() throws { let splitRegex = try Regex(#"\|"#) XCTAssertEqual( @@ -225,7 +322,7 @@ class AlgorithmTests: XCTestCase { } } - func testTrim() { + func testRegexTrim() { func expectTrim( _ string: String, _ regex: String, @@ -235,6 +332,10 @@ class AlgorithmTests: XCTestCase { let regex = try! 
Regex(regex) let actual = string.trimmingPrefix(regex) XCTAssertEqual(actual, expected, file: file, line: line) + + var actual2 = string + actual2.trimPrefix(regex) + XCTAssertEqual(actual2[...], expected, file: file, line: line) } expectTrim("", "", "") @@ -243,15 +344,54 @@ class AlgorithmTests: XCTestCase { expectTrim("a", "x", "a") expectTrim("___a", "_", "__a") expectTrim("___a", "_+", "a") - - XCTAssertEqual("".trimmingPrefix("a"), "") - XCTAssertEqual("a".trimmingPrefix("a"), "") - XCTAssertEqual("b".trimmingPrefix("a"), "b") - XCTAssertEqual("a".trimmingPrefix(""), "a") - XCTAssertEqual("___a".trimmingPrefix("_"), "__a") - XCTAssertEqual("___a".trimmingPrefix("___"), "a") - XCTAssertEqual("___a".trimmingPrefix("____"), "___a") - XCTAssertEqual("___a".trimmingPrefix("___a"), "") + } + + func testPredicateTrim() { + func expectTrim( + _ string: String, + _ predicate: (Character) -> Bool, + _ expected: Substring, + file: StaticString = #file, line: UInt = #line + ) { + let actual = string.trimmingPrefix(while: predicate) + XCTAssertEqual(actual, expected, file: file, line: line) + + var actual2 = string + actual2.trimPrefix(while: predicate) + XCTAssertEqual(actual2[...], expected, file: file, line: line) + } + + expectTrim("", \.isWhitespace, "") + expectTrim("a", \.isWhitespace, "a") + expectTrim(" ", \.isWhitespace, "") + expectTrim(" a", \.isWhitespace, "a") + expectTrim("a ", \.isWhitespace, "a ") + } + + func testStringTrim() { + func expectTrim( + _ string: String, + _ pattern: String, + _ expected: Substring, + file: StaticString = #file, line: UInt = #line + ) { + let actual = string.trimmingPrefix(pattern) + XCTAssertEqual(actual, expected, file: file, line: line) + + var actual2 = string + actual2.trimPrefix(pattern) + XCTAssertEqual(actual2[...], expected, file: file, line: line) + } + + expectTrim("", "", "") + expectTrim("", "x", "") + expectTrim("a", "", "a") + expectTrim("a", "x", "a") + expectTrim("a", "a", "") + expectTrim("___a", "_", "__a") + expectTrim("___a", "___", "a") + expectTrim("___a", "____", "___a") + expectTrim("___a", "___a", "") do { let prefix = makeSingleUseSequence(element: "_" as Character, count: 5) @@ -265,15 +405,9 @@ class AlgorithmTests: XCTestCase { // is just to test that it doesn't crash. 
XCTAssertNotEqual("_____a".trimmingPrefix(prefix), "") } - - XCTAssertEqual("".trimmingPrefix(while: \.isWhitespace), "") - XCTAssertEqual("a".trimmingPrefix(while: \.isWhitespace), "a") - XCTAssertEqual(" ".trimmingPrefix(while: \.isWhitespace), "") - XCTAssertEqual(" a".trimmingPrefix(while: \.isWhitespace), "a") - XCTAssertEqual("a ".trimmingPrefix(while: \.isWhitespace), "a ") } - func testReplace() { + func testRegexReplace() { func expectReplace( _ string: String, _ regex: String, @@ -297,8 +431,33 @@ class AlgorithmTests: XCTestCase { expectReplace("aab", "a", "X", "XXb") expectReplace("aab", "a+", "X", "Xb") expectReplace("aab", "a*", "X", "XXbX") + + // FIXME: Test maxReplacements + // FIXME: Test closure-based replacement } + func testStringReplace() { + func expectReplace( + _ string: String, + _ pattern: String, + _ replacement: String, + _ expected: String, + file: StaticString = #file, line: UInt = #line + ) { + let actual = string.replacing(pattern, with: replacement) + XCTAssertEqual(actual, expected, file: file, line: line) + } + + expectReplace("", "", "X", "X") + expectReplace("", "x", "X", "") + expectReplace("a", "", "X", "XaX") + expectReplace("a", "x", "X", "a") + expectReplace("a", "a", "X", "X") + expectReplace("aab", "a", "X", "XXb") + + // FIXME: Test maxReplacements + } + func testSubstring() throws { let s = "aaa | aaaaaa | aaaaaaaaaa" let s1 = s.dropFirst(6) // "aaaaaa | aaaaaaaaaa" @@ -332,6 +491,11 @@ class AlgorithmTests: XCTestCase { XCTAssertEqual( s2.matches(of: regex).map(\.0), ["aa"]) + + XCTAssertEqual( + s2.matches(of: try Regex("a*?")).map { s2.offsets(of: $0.range) }, [0..<0, 1..<1, 2..<2]) + XCTAssertEqual( + s2.ranges(of: try Regex("a*?")).map(s2.offsets(of:)), [0..<0, 1..<1, 2..<2]) } func testSwitches() { diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 9efbf2f76..ac4d8b87c 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -56,20 +56,20 @@ extension CaptureList { } } -extension StructuredCapture { +extension AnyRegexOutput.Element { func formatStringCapture(input: String) -> String { - var res = String(repeating: "some(", count: someCount) - if let r = self.storedCapture?.range { + var res = String(repeating: "some(", count: optionalDepth) + if let r = range { res += input[r] } else { res += "none" } - res += String(repeating: ")", count: someCount) + res += String(repeating: ")", count: optionalDepth) return res } } -extension Sequence where Element == StructuredCapture { +extension AnyRegexOutput { func formatStringCaptures(input: String) -> String { var res = "[" res += self.map { @@ -119,13 +119,13 @@ extension StringCapture: CustomStringConvertible { extension StringCapture { func isEqual( - to structCap: StructuredCapture, + to structCap: AnyRegexOutput.Element, in input: String ) -> Bool { - guard optionalCount == structCap.optionalCount else { + guard optionalCount == structCap.optionalDepth else { return false } - guard let r = structCap.storedCapture?.range else { + guard let r = structCap.range else { return contents == nil } guard let s = contents else { @@ -202,7 +202,7 @@ func captureTest( return } - let caps = result.rawCaptures + let caps = result.anyRegexOutput guard caps.count == output.count else { XCTFail(""" Mismatch capture count: @@ -213,7 +213,7 @@ func captureTest( """) continue } - + guard output.elementsEqual(caps, by: { $0.isEqual(to: $1, in: input) }) else { @@ -459,7 +459,33 @@ extension RegexTests { // TODO: "((a|b)|c)*" } - + 
+ func testTypeVerification() throws { + let opaque1 = try Regex("abc") + _ = try XCTUnwrap(opaque1.as(Substring.self)) + XCTAssertNil(opaque1.as((Substring, Substring).self)) + XCTAssertNil(opaque1.as(Int.self)) + + let opaque2 = try Regex("(abc)") + _ = try XCTUnwrap(opaque2.as((Substring, Substring).self)) + XCTAssertNil(opaque2.as(Substring.self)) + XCTAssertNil(opaque2.as((Substring, Int).self)) + + let opaque3 = try Regex("(?abc)") + _ = try XCTUnwrap(opaque3.as((Substring, someLabel: Substring).self)) + XCTAssertNil(opaque3.as((Substring, Substring).self)) + XCTAssertNil(opaque3.as(Substring.self)) + + let opaque4 = try Regex("(?abc)?") + _ = try XCTUnwrap(opaque4.as((Substring, somethingHere: Substring?).self)) + XCTAssertNil(opaque4.as((Substring, somethignHere: Substring).self)) + XCTAssertNil(opaque4.as((Substring, Substring?).self)) + + let opaque5 = try Regex("((a)?bc)?") + _ = try XCTUnwrap(opaque5.as((Substring, Substring?, Substring??).self)) + XCTAssertNil(opaque5.as((Substring, somethingHere: Substring?, here: Substring??).self)) + XCTAssertNil(opaque5.as((Substring, Substring?, Substring?).self)) + } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index eab46dca0..9e94a886a 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -88,4 +88,51 @@ extension RegexTests { try testCompilationEquivalence(row) } } + + func testCompileInitialOptions() throws { + func expectInitialOptions( + _ regex: Regex, + _ optionSequence: AST.MatchingOptionSequence, + file: StaticString = #file, + line: UInt = #line + ) throws { + var options = MatchingOptions() + options.apply(optionSequence) + + XCTAssertTrue( + regex.program.loweredProgram.initialOptions._equal(to: options), + file: file, line: line) + } + + func expectInitialOptions( + _ pattern: String, + _ optionSequence: AST.MatchingOptionSequence, + file: StaticString = #file, + line: UInt = #line + ) throws { + let regex = try Regex(pattern) + try expectInitialOptions(regex, optionSequence, file: file, line: line) + } + + try expectInitialOptions(".", matchingOptions()) + try expectInitialOptions("(?i)(?-i).", matchingOptions()) + + try expectInitialOptions("(?i).", matchingOptions(adding: [.caseInsensitive])) + try expectInitialOptions("(?i).(?-i)", matchingOptions(adding: [.caseInsensitive])) + + try expectInitialOptions( + "(?im)(?s).", + matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine])) + try expectInitialOptions(".", matchingOptions()) + try expectInitialOptions( + "(?im)(?s).(?u)", + matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine])) + + try expectInitialOptions( + "(?i:.)", + matchingOptions(adding: [.caseInsensitive])) + try expectInitialOptions( + "(?i:.)(?m:.)", + matchingOptions(adding: [.caseInsensitive])) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 36056e85a..ed16905b8 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -35,7 +35,7 @@ extension Executor { in: start...)(?P=a1)"#, + input: "aaaaaaaaabbc", match: "aaaaaaaaabb") + firstMatchTest( #"(.)\g001"#, input: "112", match: "11") - firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) - firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba", xfail: true) - firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) + firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba") + + firstMatchTest(#"(?.)(?.)(?.)\k\k\k"#, + input: 
"xyzzxy", match: "xyzzxy") firstMatchTest(#"\1(.)"#, input: "112", match: nil) + firstMatchTest(#"\k(?.)"#, input: "112", match: nil) + + // TODO: Implement subpattern matching. + firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) + firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) } func testMatchExamples() { @@ -1282,7 +1305,6 @@ extension RegexTests { firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t") firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa") - firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a") } func testASCIIClasses() { @@ -1613,5 +1635,15 @@ extension RegexTests { // TODO: Add test for grapheme boundaries at start/end of match + func testCase() { + let regex = try! Regex(#".\N{SPARKLING HEART}."#) + let input = "🧟‍♀️💖🧠 or 🧠💖☕️" + let characterMatches = input.matches(of: regex) + XCTAssertEqual(characterMatches.map { $0.0 }, ["🧟‍♀️💖🧠", "🧠💖☕️"]) + + let scalarMatches = input.matches(of: regex.matchingSemantics(.unicodeScalar)) + let scalarExpected: [Substring] = ["\u{FE0F}💖🧠", "🧠💖☕"] + XCTAssertEqual(scalarMatches.map { $0.0 }, scalarExpected) + } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index ed930b0fe..20067ac20 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -34,7 +34,7 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } enum SemanticErrorKind { - case unsupported, invalid + case unsupported, invalid, unchecked } class RegexTests: XCTestCase {} @@ -68,7 +68,7 @@ func parseTest( XCTFail("unexpected error: \(error)", file: file, line: line) return } - if let errorKind = errorKind { + if let errorKind = errorKind, errorKind != .unchecked { do { _ = try parse(input, .semantic, syntax) XCTFail("expected semantically invalid AST", file: file, line: line) @@ -394,6 +394,12 @@ extension RegexTests { #"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) + // MARK: Allowed combining characters + + parseTest("e\u{301}", "e\u{301}") + parseTest("1\u{358}", "1\u{358}") + parseTest(#"\ \#u{361}"#, " \u{361}") + // MARK: Alternations parseTest( @@ -466,14 +472,6 @@ extension RegexTests { parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - // TODO: These are treated as octal sequences by PCRE, we should warn and - // suggest user prefix with 0. - parseTest(#"[\1]"#, charClass("1")) - parseTest(#"[\123]"#, charClass("1", "2", "3")) - parseTest(#"[\101]"#, charClass("1", "0", "1")) - parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) - parseTest(#"[\181]"#, charClass("1", "8", "1")) - // We take *up to* the first two valid digits for \x. No valid digits is 0. 
parseTest(#"\x"#, scalar("\u{0}")) parseTest(#"\x5"#, scalar("\u{5}")) @@ -484,6 +482,8 @@ extension RegexTests { parseTest(#"\u{ a }"#, scalar("\u{A}")) parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}"))) + parseTest(#"[\u{301}]"#, charClass(scalar_m("\u{301}"))) + // MARK: Scalar sequences parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}")) @@ -517,34 +517,19 @@ extension RegexTests { "[a-b-c]", charClass(range_m("a", "b"), "-", "c")) parseTest("[-a-]", charClass("-", "a", "-")) + parseTest("[[a]-]", charClass(charClass("a"), "-")) + parseTest("[[a]-b]", charClass(charClass("a"), "-", "b")) parseTest("[a-z]", charClass(range_m("a", "z"))) parseTest("[a-a]", charClass(range_m("a", "a"))) parseTest("[B-a]", charClass(range_m("B", "a"))) - // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( - .setOperation([range_m("a", "d")], .init(faking: .subtraction), [range_m("a", "c")]) + setOp(range_m("a", "d"), op: .subtraction, range_m("a", "c")) )) parseTest("[-]", charClass("-")) - - // Empty character classes are forbidden, therefore these are character - // classes containing literal ']'. - parseTest("[]]", charClass("]")) - parseTest("[]a]", charClass("]", "a")) - parseTest("(?x)[ ]]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("]") - )) - parseTest("(?x)[ ] ]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("]") - )) - parseTest("(?x)[ ] a ]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("]", "a") - )) + parseTest(#"[\]]"#, charClass("]")) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -696,34 +681,45 @@ extension RegexTests { throwsError: .unsupported ) + parseTest(#"(?x)[ a - b ]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "b")) + )) + + parseTest(#"(?x)[a - b]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "b")) + )) + // MARK: Operators parseTest( #"[a[bc]de&&[^bc]\d]+"#, - oneOrMore(of: charClass( - .setOperation( - ["a", charClass("b", "c"), "d", "e"], - .init(faking: .intersection), - [charClass("b", "c", inverted: true), atom_m(.escaped(.decimalDigit))] - )))) + oneOrMore(of: charClass(setOp( + "a", charClass("b", "c"), "d", "e", + op: .intersection, + charClass("b", "c", inverted: true), atom_m(.escaped(.decimalDigit)) + ))) + ) parseTest( - "[a&&b]", - charClass( - .setOperation(["a"], .init(faking: .intersection), ["b"]))) + "[a&&b]", charClass(setOp("a", op: .intersection, "b")) + ) parseTest( "[abc--def]", - charClass(.setOperation(["a", "b", "c"], .init(faking: .subtraction), ["d", "e", "f"]))) + charClass(setOp("a", "b", "c", op: .subtraction, "d", "e", "f")) + ) // We left-associate for chained operators. parseTest( "[ab&&b~~cd]", - charClass( - .setOperation( - [.setOperation(["a", "b"], .init(faking: .intersection), ["b"])], - .init(faking: .symmetricDifference), - ["c", "d"]))) + charClass(setOp( + setOp("a", "b", op: .intersection, "b"), + op: .symmetricDifference, + "c", "d" + )) + ) // Operators are only valid in custom character classes. 
parseTest( @@ -739,11 +735,11 @@ extension RegexTests { parseTest( "[ && ]", - charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) + charClass(setOp(" ", op: .intersection, " ", " ")) ) parseTest("(?x)[ a && b ]", concat( changeMatchingOptions(matchingOptions(adding: .extended)), - charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + charClass(setOp("a", op: .intersection, "b")) )) // MARK: Quotes @@ -804,6 +800,20 @@ extension RegexTests { #"a(?#. comment)b"#, concat("a", "b")) + // MARK: Interpolation + + // These are literal as there's no closing '}>' + parseTest("<{", concat("<", "{")) + parseTest("<{a", concat("<", "{", "a")) + parseTest("<{a}", concat("<", "{", "a", "}")) + parseTest("<{<{}", concat("<", "{", "<", "{", "}")) + + // Literal as escaped + parseTest(#"\<{}>"#, concat("<", "{", "}", ">")) + + // A quantification + parseTest(#"<{2}"#, exactly(2, of: "<")) + // MARK: Quantification parseTest("a*", zeroOrMore(of: "a")) @@ -834,6 +844,10 @@ extension RegexTests { #"a{1,1}"#, quantRange(1...1, of: "a")) + parseTest("x{3, 5}", quantRange(3 ... 5, of: "x")) + parseTest("x{ 3 , 5 }", quantRange(3 ... 5, of: "x")) + parseTest("x{3 }", exactly(3, of: "x")) + // Make sure ranges get treated as literal if invalid. parseTest("{", "{") parseTest("{,", concat("{", ",")) @@ -853,11 +867,6 @@ extension RegexTests { parseTest("x{+", concat("x", oneOrMore(of: "{"))) parseTest("x{6,+", concat("x", "{", "6", oneOrMore(of: ","))) - // TODO: We should emit a diagnostic for this. - parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}")) - parseTest("{3, 5}", concat("{", "3", ",", " ", "5", "}")) - parseTest("{3 }", concat("{", "3", " ", "}")) - // MARK: Groups // Named captures @@ -1233,16 +1242,37 @@ extension RegexTests { parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) - parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported) - parseTest(#"\k"#, backreference(.named("bc")), throwsError: .unsupported) - parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported) - parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported) + parseTest( + #"(?)\k"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + parseTest( + #"(?)\k{a}"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + parseTest( + #"(?)\g{a}"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + parseTest( + #"(?)(?P=a)"#, concat( + namedCapture("a", empty()), backreference(.named("a")) + ), captures: [.named("a")] + ) + + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .invalid) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid) // Oniguruma recursion levels. 
parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid) + parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported) + parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) @@ -1267,10 +1297,6 @@ extension RegexTests { parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) - // Backreferences are not valid in custom character classes. - parseTest(#"[\8]"#, charClass("8")) - parseTest(#"[\9]"#, charClass("9")) - // These are valid references. parseTest(#"()\1"#, concat( capture(empty()), backreference(.absolute(1)) @@ -1358,7 +1384,60 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic)), throwsError: .unsupported) + parseTest(#"\p{In_Runic}"#, prop(.block(.runic)), throwsError: .unsupported) + + parseTest(#"\p{Hebrew}"#, prop(.scriptExtension(.hebrew))) + parseTest(#"\p{Is_Hebrew}"#, prop(.scriptExtension(.hebrew))) + parseTest(#"\p{In_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) + parseTest(#"\p{Blk=Is_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) + + // These are the shorthand properties with an "in" prefix we currently + // recognize. Make sure they don't clash with block properties. + parseTest(#"\p{initialpunctuation}"#, prop(.generalCategory(.initialPunctuation))) + parseTest(#"\p{inscriptionalpahlavi}"#, prop(.scriptExtension(.inscriptionalPahlavi))) + parseTest(#"\p{inscriptionalparthian}"#, prop(.scriptExtension(.inscriptionalParthian))) + parseTest(#"\p{inherited}"#, prop(.scriptExtension(.inherited))) + + // Make sure these are round-trippable. + for s in Unicode.Script.allCases { + parseTest(#"\p{\#(s.rawValue)}"#, prop(.scriptExtension(s))) + parseTest(#"\p{is\#(s.rawValue)}"#, prop(.scriptExtension(s))) + } + for g in Unicode.ExtendedGeneralCategory.allCases { + parseTest(#"\p{\#(g.rawValue)}"#, prop(.generalCategory(g))) + parseTest(#"\p{is\#(g.rawValue)}"#, prop(.generalCategory(g))) + } + for p in Unicode.POSIXProperty.allCases { + parseTest(#"\p{\#(p.rawValue)}"#, prop(.posix(p))) + parseTest(#"\p{is\#(p.rawValue)}"#, prop(.posix(p))) + } + for b in Unicode.BinaryProperty.allCases { + // Some of these are unsupported, so don't check for semantic errors. 
+ parseTest(#"\p{\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) + parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) + } + + for j in AST.Atom.CharacterProperty.JavaSpecial.allCases { + parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), throwsError: .unsupported) + } + + // Try prefixing each block property with "in" to make sure we don't stomp + // on any other property shorthands. + for b in Unicode.Block.allCases { + parseTest(#"\p{in\#(b.rawValue)}"#, prop(.block(b)), throwsError: .unsupported) + } + + parseTest(#"\p{ASCII}"#, prop(.ascii)) + parseTest(#"\p{isASCII}"#, prop(.ascii)) + parseTest(#"\p{inASCII}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + + parseTest(#"\p{inBasicLatin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{In_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{Blk=Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{Blk=Is_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + + parseTest(#"\p{isAny}"#, prop(.any)) + parseTest(#"\p{isAssigned}"#, prop(.assigned)) parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) @@ -1654,6 +1733,9 @@ extension RegexTests { parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) parseTest("# abc", concat("#", " ", "a", "b", "c")) + parseTest("(?#)", empty()) + parseTest("/**/", empty(), syntax: .experimental) + // MARK: Matching option changing parseTest( @@ -1681,12 +1763,8 @@ extension RegexTests { ) ) - // End of line comments aren't applicable in custom char classes. - // TODO: ICU supports this. - parseTest("(?x)[ # abc]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("#", "a", "b", "c") - )) + parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c")) + parseTest("[#]", charClass("#")) parseTest("(?x)a b c[d e f]", concat( changeMatchingOptions(matchingOptions(adding: .extended)), @@ -1774,10 +1852,10 @@ extension RegexTests { // PCRE states that whitespace won't be ignored within a range. // http://pcre.org/current/doc/html/pcre2api.html#SEC20 - // TODO: We ought to warn on this, and produce a range anyway. + // We however do ignore it. parseTest("(?x)a{1, 3}", concat( changeMatchingOptions(matchingOptions(adding: .extended)), - "a", "{", "1", ",", "3", "}" + quantRange(1 ... 3, of: "a") )) // Test that we cover the list of whitespace characters covered by PCRE. @@ -2119,6 +2197,26 @@ extension RegexTests { /# """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}")) + parseWithDelimitersTest(#""" + #/ + [ + a # interesting + b-c #a + d] + /# + """#, charClass("a", range_m("b", "c"), "d")) + + parseWithDelimitersTest(#""" + #/ + [ + a # interesting + - #a + b + ] + /# + """#, charClass(range_m("a", "b"))) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. 
@@ -2143,7 +2241,7 @@ extension RegexTests { throwsError: .unsupported ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid) parseWithDelimitersTest( #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), throwsError: .unsupported @@ -2216,6 +2314,8 @@ extension RegexTests { parseNotEqualTest(#"[abc]"#, #"[a b c]"#) + parseNotEqualTest("[abc]", "[^abc]") + parseNotEqualTest(#"\1"#, #"\10"#) parseNotEqualTest("(?^:)", ("(?-:)")) @@ -2463,6 +2563,20 @@ extension RegexTests { diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) + diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) + diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category")) + diagnosticTest(#"\p{age=3}"#, .invalidAge("3")) + diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3")) + diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1")) + diagnosticTest(#"\p{nv=A}"#, .invalidNumericValue("A")) + diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4")) + diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType")) + diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric")) + diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping")) + diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty) + diagnosticTest(#"\p{ccc=255}"#, .invalidCCC("255")) + diagnosticTest(#"\p{ccc=Nada}"#, .invalidCCC("Nada")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) @@ -2497,10 +2611,15 @@ extension RegexTests { diagnosticTest("[a", .expected("]")) - // The first ']' of a custom character class is literal, so these are - // missing the closing bracket. - diagnosticTest("[]", .expected("]")) - diagnosticTest("(?x)[ ]", .expected("]")) + // Character classes may not be empty. 
+ diagnosticTest("[]", .expectedCustomCharacterClassMembers) + diagnosticTest("[]]", .expectedCustomCharacterClassMembers) + diagnosticTest("[]a]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ] ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ] a ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?xx)[ ] a ]+", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ]]", .expectedCustomCharacterClassMembers) diagnosticTest("[&&]", .expectedCustomCharacterClassMembers) diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers) @@ -2527,6 +2646,12 @@ extension RegexTests { diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest("(?x)[(?#)]", .expected("]")) + diagnosticTest("(?x)[(?#abc)]", .expected("]")) + + diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2547,6 +2672,17 @@ extension RegexTests { // TODO: Custom diagnostic for missing '\Q' diagnosticTest(#"\E"#, .invalidEscape("E")) + // PCRE treats these as octal, but we require a `0` prefix. + diagnosticTest(#"[\1]"#, .invalidEscape("1")) + diagnosticTest(#"[\123]"#, .invalidEscape("1")) + diagnosticTest(#"[\101]"#, .invalidEscape("1")) + diagnosticTest(#"[\7777]"#, .invalidEscape("7")) + diagnosticTest(#"[\181]"#, .invalidEscape("1")) + + // Backreferences are not valid in custom character classes. + diagnosticTest(#"[\8]"#, .invalidEscape("8")) + diagnosticTest(#"[\9]"#, .invalidEscape("9")) + // Non-ASCII non-whitespace cases. 
diagnosticTest(#"\🔥"#, .invalidEscape("🔥")) diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰")) @@ -2555,6 +2691,27 @@ extension RegexTests { diagnosticTest(#"\˂"#, .invalidEscape("˂")) diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}")) + // MARK: Confusable characters + + diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}")) + diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}")) + diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}")) + diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#)) + diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}")) + diagnosticTest("$\u{35E}", .confusableCharacter("$\u{35E}")) + diagnosticTest(".\u{35F}", .confusableCharacter(".\u{35F}")) + diagnosticTest("|\u{360}", .confusableCharacter("|\u{360}")) + diagnosticTest(" \u{361}", .confusableCharacter(" \u{361}")) + + // MARK: Interpolation (currently unsupported) + + diagnosticTest("<{}>", .unsupported("interpolation")) + diagnosticTest("<{...}>", .unsupported("interpolation")) + diagnosticTest("<{)}>", .unsupported("interpolation")) + diagnosticTest("<{}}>", .unsupported("interpolation")) + diagnosticTest("<{<{}>", .unsupported("interpolation")) + diagnosticTest("(<{)}>", .unsupported("interpolation")) + // MARK: Character properties diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx")) @@ -2565,6 +2722,9 @@ extension RegexTests { diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + diagnosticTest(#"\p{Basic_Latin}"#, .unknownProperty(key: nil, value: "Basic_Latin")) + diagnosticTest(#"\p{Blk=In_Basic_Latin}"#, .unrecognizedBlock("In_Basic_Latin")) + // We only filter pattern whitespace, which doesn't include things like // non-breaking spaces. diagnosticTest(#"\p{L\#u{A0}l}"#, .unknownProperty(key: nil, value: "L\u{A0}l")) @@ -2645,6 +2805,9 @@ extension RegexTests { diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + diagnosticTest("{3, 5}", .quantifierRequiresOperand("{3, 5}")) + diagnosticTest("{3 }", .quantifierRequiresOperand("{3 }")) + // These are not quantifiable. diagnosticTest(#"\b?"#, .notQuantifiable) diagnosticTest(#"\B*"#, .notQuantifiable) @@ -2725,6 +2888,12 @@ extension RegexTests { diagnosticTest(#"(?:)()\2"#, .invalidReference(2)) diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2)) + diagnosticTest(#"\k"#, .invalidNamedReference("a")) + diagnosticTest(#"(?:)\k"#, .invalidNamedReference("a")) + diagnosticTest(#"()\k"#, .invalidNamedReference("a")) + diagnosticTest(#"()\k()"#, .invalidNamedReference("a")) + diagnosticTest(#"(?)\k()"#, .invalidNamedReference("a")) + // MARK: Conditionals diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift new file mode 100644 index 000000000..4d244e2cc --- /dev/null +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest +import _RegexParser +@_spi(PatternConverter) @testable +import _StringProcessing + +class RenderDSLTests: XCTestCase {} + +func testConversion( + _ regex: String, + _ expectedDSL: String, + file: StaticString = #file, line: UInt = #line +) throws { + let ast = try _RegexParser.parse(regex, .semantic, .traditional) + let actualDSL = renderAsBuilderDSL(ast: ast)._trimmingSuffix(while: \.isWhitespace) + XCTAssertEqual(actualDSL, expectedDSL[...], file: file, line: line) +} + +extension RenderDSLTests { + func testSimpleConversions() throws { + try testConversion(#"ab+c"#, """ + Regex { + "a" + OneOrMore { + "b" + } + "c" + } + """) + + try testConversion(#"(?:a*)b?(c+)"#, """ + Regex { + ZeroOrMore { + "a" + } + Optionally { + "b" + } + Capture { + OneOrMore { + "c" + } + } + } + """) + + try testConversion(#"\d+"#, """ + Regex { + OneOrMore { + .digit + } + } + """) + try XCTExpectFailure("Invalid leading dot syntax in non-initial position") { + try testConversion(#":\d:"#, """ + Regex { + ":" + CharacterClass.digit + ":" + } + """) + } + } + + func testOptions() throws { + try XCTExpectFailure("Options like '(?i)' aren't converted") { + try testConversion(#"(?i)abc"#, """ + Regex { + "abc" + }.ignoresCase() + """) + } + + try XCTExpectFailure("Options like '(?i:...)' aren't converted") { + try testConversion(#"(?i:abc)"#, """ + Regex { + "abc" + }.ignoresCase() + """) + } + } + + func testAlternations() throws { + try testConversion(#"a|b"#, """ + Regex { + ChoiceOf { + "a" + "b" + } + } + """) + + try XCTExpectFailure("Concatenations in alternations aren't grouped") { + try testConversion(#"\da|bc"#, """ + Regex { + ChoiceOf { + Regex { + .digit + "a" + } + "bc" + } + } + """) + } + } +} diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index d13b47b8d..bf41c5f2e 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -62,7 +62,7 @@ fileprivate func expectFirstMatch( } #if os(Linux) -func XCTExpectFailure(_ message: String? = nil, body: () -> Void) {} +func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {} #endif // MARK: - Basic Unicode Support: Level 1 @@ -78,6 +78,9 @@ extension UTS18Tests { func testHexNotation() { expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab") expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞") + expectFirstMatch("\n", regex(#"\u{0A}"#), "\n") + expectFirstMatch("\r", regex(#"\u{0D}"#), "\r") + expectFirstMatch("\r\n", regex(#"\u{0D}\u{0A}"#), "\r\n") } // 1.1.1 Hex Notation and Normalization @@ -148,12 +151,8 @@ extension UTS18Tests { } func testProperties_XFail() { - XCTExpectFailure("Need to support 'age' and 'block' properties") { - // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) - XCTFail(#"\(#/\p{age=3.1}/#)"#) - // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) - XCTFail(#"\(#/\p{Block=Greek}/#)"#) - } + // Certain properties are unsupported, see below. 
+ XCTAssertThrowsError(try Regex(#"\p{Block=Greek}"#)) } // RL1.2a Compatibility Properties @@ -171,11 +170,16 @@ extension UTS18Tests { expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) - // TODO: blank - // TODO: cntrl expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) + + let blankAndControl = """ + \t\u{01}\u{19} + """ + // \t - tab is in both [:blank:] and [:cntrl:] + expectFirstMatch(blankAndControl, regex(#"[[:blank:]]+"#), blankAndControl[pos: ..<2]) + expectFirstMatch(blankAndControl, regex(#"[[:cntrl:]]+"#), blankAndControl[pos: 1...]) } //RL1.3 Subtraction and Intersection @@ -196,7 +200,7 @@ extension UTS18Tests { // Non-ASCII lowercase + non-lowercase ASCII expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) - XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) + XCTAssertTrue("123%&^ABCDéîøü".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) } func testSubtractionAndIntersectionPrecedence() { @@ -380,12 +384,15 @@ extension UTS18Tests { XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) + XCTAssertFalse("abcdef🇬🇭".contains(regex(#"abcdef.$"#).matchingSemantics(.unicodeScalar))) } func testCharacterClassesWithStrings() { let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertTrue("🧐".contains(regex)) XCTAssertTrue("🇧🇫".contains(regex)) + XCTAssertTrue("🧐".contains(regex.matchingSemantics(.unicodeScalar))) + XCTAssertTrue("🇧🇫".contains(regex.matchingSemantics(.unicodeScalar))) } // RL2.3 Default Word Boundaries @@ -468,7 +475,7 @@ extension UTS18Tests { // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) } - XCTExpectFailure("Other named char failures -- investigate") { + XCTExpectFailure("Other named char failures -- name aliases") { XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) @@ -486,7 +493,8 @@ extension UTS18Tests { // To meet this requirement, an implementation shall support wildcards in // Unicode property values. 
func testWildcardsInPropertyValues() { - XCTExpectFailure { XCTFail("Implement tests") } + // Unsupported + XCTAssertThrowsError(try Regex(#"\p{name=/a/"#)) } // RL2.7 Full Properties @@ -498,121 +506,462 @@ extension UTS18Tests { func testFullProperties() { // MARK: General // Name (Name_Alias) + XCTAssertTrue("a".contains(regex(#"\p{name=latin small letter a}"#))) + // Block + // Unsupported + // Age + XCTAssertTrue("a".contains(regex(#"\p{age=1.1}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=V1_1}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=14.0}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=V99_99}"#))) + + XCTAssertTrue("🥱".contains(regex(#"\p{age=12.0}"#))) + XCTAssertFalse("🥱".contains(regex(#"\p{age=11.0}"#))) + + XCTAssertTrue("⌁".contains(regex(#"\p{age=3.0}"#))) + XCTAssertFalse("⌁".contains(regex(#"\p{age=2.0}"#))) + XCTAssertTrue("⌁".contains(regex(#"[\p{age=3.0}--\p{age=2.0}]"#))) + // General_Category + XCTAssertTrue("a".contains(regex(#"\p{Ll}"#))) + XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertFalse("A".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertTrue("A".contains(regex(#"\p{gc=L}"#))) + + XCTAssertTrue("a".contains(regex(#"\p{Any}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Assigned}"#))) + XCTAssertTrue("a".contains(regex(#"\p{ASCII}"#))) + // Script (Script_Extensions) + XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#))) + XCTAssertTrue("강".contains(regex(#"\p{script=hangul}"#))) + // White_Space + XCTAssertTrue(" ".contains(regex(#"\p{whitespace}"#))) + XCTAssertTrue("\n".contains(regex(#"\p{White_Space}"#))) + XCTAssertFalse("a".contains(regex(#"\p{whitespace}"#))) + // Alphabetic + XCTAssertTrue("aéîøüƒ".contains(regex(#"^\p{Alphabetic}+$"#))) + // Hangul_Syllable_Type + // Unsupported + // Noncharacter_Code_Point + XCTAssertTrue("\u{10FFFF}".contains(regex(#"\p{Noncharacter_Code_Point}"#))) + // Default_Ignorable_Code_Point + XCTAssertTrue("\u{00AD}".contains(regex(#"\p{Default_Ignorable_Code_Point}"#))) + // Deprecated + XCTAssertTrue("ʼn".contains(regex(#"\p{Deprecated}"#))) // Logical_Order_Exception + XCTAssertTrue("ແ".contains(regex(#"\p{Logical_Order_Exception}"#))) // Variation_Selector + XCTAssertTrue("\u{FE07}".contains(regex(#"\p{Variation_Selector}"#))) // MARK: Numeric // Numeric_Value + XCTAssertTrue("3".contains(regex(#"\p{Numeric_Value=3}"#))) + XCTAssertFalse("4".contains(regex(#"\p{Numeric_Value=3}"#))) + XCTAssertTrue("④".contains(regex(#"\p{Numeric_Value=4}"#))) + XCTAssertTrue("⅕".contains(regex(#"\p{Numeric_Value=0.2}"#))) + // Numeric_Type + XCTAssertTrue("3".contains(regex(#"\p{Numeric_Type=Decimal}"#))) + XCTAssertFalse("4".contains(regex(#"\p{Numeric_Type=Digit}"#))) + // Hex_Digit + XCTAssertTrue("0123456789abcdef0123456789ABCDEF" + .contains(regex(#"^\p{Hex_Digit}+$"#))) + XCTAssertFalse("0123456789abcdefg".contains(regex(#"^\p{Hex_Digit}+$"#))) // ASCII_Hex_Digit + XCTAssertTrue("0123456789abcdef".contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) + XCTAssertFalse("0123456789abcdef0123456789ABCDEF" + .contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) // MARK: Identifiers - // ID_Continue // ID_Start - // XID_Continue + XCTAssertTrue("ABcd".contains(regex(#"^\p{ID_Start}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{ID_Start}"#))) + + // ID_Continue + XCTAssertTrue("ABcd_1234".contains(regex(#"^\p{ID_Continue}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{ID_Continue}"#))) + // XID_Start + XCTAssertTrue("ABcd".contains(regex(#"^\p{XID_Start}+$"#))) + 
XCTAssertFalse(" ':`-".contains(regex(#"\p{XID_Start}"#))) + + // XID_Continue + XCTAssertTrue("ABcd_1234".contains(regex(#"^\p{XID_Continue}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{XID_Continue}"#))) + // Pattern_Syntax + XCTAssertTrue(".+-:".contains(regex(#"^\p{Pattern_Syntax}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Pattern_Syntax}"#))) + // Pattern_White_Space + XCTAssertTrue(" \t\n".contains(regex(#"^\p{Pattern_White_Space}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Pattern_White_Space}"#))) + // Identifier_Status + // Unsupported + // Identifier_Type + // Unsupported // MARK: CJK // Ideographic + XCTAssertTrue("微笑".contains(regex(#"^\p{IsIdeographic}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Ideographic}"#))) + // Unified_Ideograph + XCTAssertTrue("微笑".contains(regex(#"^\p{Unified_Ideograph}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Unified_Ideograph}"#))) + // Radical + XCTAssertTrue("⺁⺂⺆".contains(regex(#"^\p{Radical}+$"#))) + // IDS_Binary_Operator + XCTAssertTrue("⿰⿸⿻".contains(regex(#"^\p{IDS_Binary_Operator}+$"#))) + // IDS_Trinary_Operator + XCTAssertTrue("⿲⿳".contains(regex(#"^\p{IDS_Trinary_Operator}+$"#))) + // Equivalent_Unified_Ideograph - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) - // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) - } + // Unsupported // MARK: Case // Uppercase + XCTAssertTrue("AÉÎØÜ".contains(regex(#"^\p{isUppercase}+$"#))) + XCTAssertFalse("123abc".contains(regex(#"^\p{isUppercase}+$"#))) + // Lowercase + XCTAssertTrue("aéîøü".contains(regex(#"^\p{Lowercase}+$"#))) + XCTAssertFalse("123ABC".contains(regex(#"\p{Lowercase}+$"#))) + // Simple_Lowercase_Mapping + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Lowercase_Mapping=a}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Lowercase_Mapping=a}"#))) + // Simple_Titlecase_Mapping + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Titlecase_Mapping=A}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Titlecase_Mapping=A}"#))) + // Simple_Uppercase_Mapping + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Uppercase_Mapping=A}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Uppercase_Mapping=A}"#))) + // Simple_Case_Folding + // Unsupported + // Soft_Dotted + XCTAssertTrue("ijɨʝⅈⅉ".contains(regex(#"^\p{Soft_Dotted}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Soft_Dotted}"#))) + // Cased + XCTAssertTrue("A".contains(regex(#"\p{Cased}"#))) + XCTAssertTrue("A".contains(regex(#"\p{Is_Cased}"#))) + XCTAssertFalse("0".contains(regex(#"\p{Cased}"#))) + // Case_Ignorable + XCTAssertTrue(":".contains(regex(#"\p{Case_Ignorable}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Case_Ignorable}"#))) + // Changes_When_Lowercased + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased}"#))) + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased=true}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Lowercased}"#))) + // Changes_When_Uppercased XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) + // Changes_When_Titlecased + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Titlecased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Titlecased}"#))) + // Changes_When_Casefolded - // Changes_When_Casemapped + 
XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Casefolded=true}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Casefolded}"#))) + XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casefolded}"#))) + // Changes_When_Casemapped + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Casemapped}"#))) + XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casemapped}"#))) + // MARK: Normalization // Canonical_Combining_Class + XCTAssertTrue("\u{0321}\u{0322}\u{1DD0}".contains(regex(#"^\p{Canonical_Combining_Class=202}+$"#))) + XCTAssertFalse("123".contains(regex(#"\p{Canonical_Combining_Class=202}"#))) + // Decomposition_Type + // Unsupported + // NFC_Quick_Check + // Unsupported + // NFKC_Quick_Check + // Unsupported + // NFD_Quick_Check + // Unsupported + // NFKD_Quick_Check + // Unsupported + // NFKC_Casefold + // Unsupported + // Changes_When_NFKC_Casefolded + XCTAssertTrue("ABCÊÖ".contains(regex(#"^\p{Changes_When_NFKC_Casefolded}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Changes_When_NFKC_Casefolded}"#))) // MARK: Emoji // Emoji + XCTAssertTrue("🥰🥳🤩".contains(regex(#"^\p{Emoji}+$"#))) + XCTAssertFalse("abc ◎✩℥".contains(regex(#"\p{Emoji}"#))) + // Emoji_Presentation + XCTAssertTrue("⌚☕☔".contains(regex(#"^\p{Emoji_Presentation}+$"#))) + XCTAssertFalse("abc ǽǮ".contains(regex(#"\p{Emoji_Presentation}"#))) + // Emoji_Modifier + XCTAssertTrue("\u{1F3FB}\u{1F3FC}\u{1F3FD}".contains(regex(#"^\p{Emoji_Modifier}+$"#))) + XCTAssertFalse("🧒".contains(regex(#"\p{Emoji_Modifier}"#))) + // Emoji_Modifier_Base + XCTAssertTrue("🧒".contains(regex(#"^\p{Emoji_Modifier_Base}+$"#))) + XCTAssertFalse("123 🧠".contains(regex(#"\p{Emoji_Modifier_Base}"#))) + // Emoji_Component + // Unsupported + // Extended_Pictographic + // Unsupported + // Basic_Emoji* + // Unsupported + // Emoji_Keycap_Sequence* + // Unsupported + // RGI_Emoji_Modifier_Sequence* + // Unsupported + // RGI_Emoji_Flag_Sequence* + // Unsupported + // RGI_Emoji_Tag_Sequence* + // Unsupported + // RGI_Emoji_ZWJ_Sequence* + // Unsupported + // RGI_Emoji* + // Unsupported // MARK: Shaping and Rendering // Join_Control + XCTAssertTrue("\u{200C}\u{200D}".contains(regex(#"^\p{Join_Control}+$"#))) + XCTAssertFalse("123".contains(regex(#"\p{Join_Control}"#))) + // Joining_Group + // Unsupported + // Joining_Type + // Unsupported + // Vertical_Orientation + // Unsupported + // Line_Break + // Unsupported + // Grapheme_Cluster_Break + // Unsupported + // Sentence_Break + // Unsupported + // Word_Break + // Unsupported + // East_Asian_Width + // Unsupported + // Prepended_Concatenation_Mark + // Unsupported // MARK: Bidirectional // Bidi_Class + // Unsupported + // Bidi_Control + XCTAssertTrue("\u{200E}\u{200F}\u{2069}".contains(regex(#"^\p{Bidi_Control}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Control}"#))) + // Bidi_Mirrored + XCTAssertTrue("()<>{}❮❯«»".contains(regex(#"^\p{Bidi_Mirrored}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Mirrored}"#))) + // Bidi_Mirroring_Glyph + // Unsupported + // Bidi_Paired_Bracket + // Unsupported + // Bidi_Paired_Bracket_Type + // Unsupported // MARK: Miscellaneous // Math + XCTAssertTrue("𝒶𝖇𝕔𝖽𝗲𝘧𝙜𝚑𝛊𝜅𝝀𝝡𝞰𝟙𝟐𝟯𝟺".contains(regex(#"^\p{Math}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Math}"#))) + // Quotation_Mark + XCTAssertTrue(#"“«‘"’»”"#.contains(regex(#"^\p{Quotation_Mark}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Quotation_Mark}"#))) + // Dash + XCTAssertTrue("—-–".contains(regex(#"^\p{Dash}+$"#))) + XCTAssertFalse("abc 
123".contains(regex(#"\p{Dash}"#))) + // Sentence_Terminal + XCTAssertTrue(".!?".contains(regex(#"^\p{Sentence_Terminal}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Sentence_Terminal}"#))) + // Terminal_Punctuation + XCTAssertTrue(":?!.".contains(regex(#"^\p{Terminal_Punctuation}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Terminal_Punctuation}"#))) + // Diacritic + XCTAssertTrue("¨`^¯ʸ".contains(regex(#"^\p{Diacritic}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Diacritic}"#))) + // Extender + XCTAssertTrue("ᪧː々".contains(regex(#"^\p{Extender}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Extender}"#))) + // Grapheme_Base + XCTAssertTrue("abc".contains(regex(#"^\p{Grapheme_Base}+$"#))) + XCTAssertFalse("\u{301}\u{FE0F}".contains(regex(#"\p{Grapheme_Base}"#))) + // Grapheme_Extend + XCTAssertTrue("\u{301}\u{302}\u{303}".contains(regex(#"^\p{Grapheme_Extend}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Grapheme_Extend}"#))) + // Regional_Indicator + XCTAssertTrue("🇰🇷🇬🇭🇵🇪".contains(regex(#"^\p{Regional_Indicator}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Regional_Indicator}"#))) + } + + func testFullProperties_Unsupported() { + // Block + XCTAssertThrowsError(try Regex(#"\p{block=Block_Elements}"#)) + + // Hangul_Syllable_Type + XCTAssertThrowsError(try Regex(#"\p{Hangul_Syllable_Type=L}/"#)) + + // Identifier_Status + XCTAssertThrowsError(try Regex(#"\p{Identifier_Status=Allowed}"#)) + + // Identifier_Type + XCTAssertThrowsError(try Regex(#"\p{Identifier_Type=Inclusion}/"#)) + + // Equivalent_Unified_Ideograph + XCTAssertThrowsError(try Regex(#"\p{Equivalent_Unified_Ideograph=⼚}"#)) + + // Simple_Case_Folding + XCTAssertThrowsError(try Regex(#"\p{Simple_Case_Folding=a}/"#)) + + // Decomposition_Type + XCTAssertThrowsError(try Regex(#"\p{Decomposition_Type}"#)) + + // NFC_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFC_Quick_Check}"#)) + + // NFKC_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFKC_Quick_Check}"#)) + + // NFD_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFD_Quick_Check}"#)) + + // NFKD_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFKD_Quick_Check}"#)) + + // NFKC_Casefold + XCTAssertThrowsError(try Regex(#"\p{NFKC_Casefold}"#)) + + // Emoji_Component + XCTAssertThrowsError(try Regex(#"\p{Emoji_Component}"#)) + + // Extended_Pictographic + XCTAssertThrowsError(try Regex(#"\p{Extended_Pictographic}"#)) + + // Basic_Emoji* + XCTAssertThrowsError(try Regex(#"\p{Basic_Emoji*}"#)) + + // Emoji_Keycap_Sequence* + XCTAssertThrowsError(try Regex(#"\p{Emoji_Keycap_Sequence*}"#)) + + // RGI_Emoji_Modifier_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Modifier_Sequence*}"#)) + + // RGI_Emoji_Flag_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Flag_Sequence*}"#)) + + // RGI_Emoji_Tag_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Tag_Sequence*}"#)) + + // RGI_Emoji_ZWJ_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_ZWJ_Sequence*}"#)) + + // RGI_Emoji* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji*}"#)) + + // Joining_Group + XCTAssertThrowsError(try Regex(#"\p{Joining_Group}"#)) + + // Joining_Type + XCTAssertThrowsError(try Regex(#"\p{Joining_Type}"#)) + + // Vertical_Orientation + XCTAssertThrowsError(try Regex(#"\p{Vertical_Orientation}"#)) + + // Line_Break + XCTAssertThrowsError(try Regex(#"\p{Line_Break}"#)) + + // Grapheme_Cluster_Break + XCTAssertThrowsError(try Regex(#"\p{Grapheme_Cluster_Break}"#)) + + // Sentence_Break + XCTAssertThrowsError(try 
Regex(#"\p{Sentence_Break}"#)) + + // Word_Break + XCTAssertThrowsError(try Regex(#"\p{Word_Break}"#)) + + // East_Asian_Width + XCTAssertThrowsError(try Regex(#"\p{East_Asian_Width}"#)) + + // Prepended_Concatenation_Mark + XCTAssertThrowsError(try Regex(#"\p{Prepended_Concatenation_Mark}"#)) + + // Bidi_Class + XCTAssertThrowsError(try Regex(#"\p{Bidi_Class}"#)) + + // Bidi_Mirroring_Glyph + XCTAssertThrowsError(try Regex(#"\p{Bidi_Mirroring_Glyph}"#)) + + // Bidi_Paired_Bracket + XCTAssertThrowsError(try Regex(#"\p{Bidi_Paired_Bracket}"#)) + + // Bidi_Paired_Bracket_Type + XCTAssertThrowsError(try Regex(#"\p{Bidi_Paired_Bracket_Type}"#)) } }