diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..330d1674f --- /dev/null +++ b/.gitignore @@ -0,0 +1,90 @@ +# Xcode +# +# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +## User settings +xcuserdata/ + +## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9) +*.xcscmblueprint +*.xccheckout + +## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4) +build/ +DerivedData/ +*.moved-aside +*.pbxuser +!default.pbxuser +*.mode1v3 +!default.mode1v3 +*.mode2v3 +!default.mode2v3 +*.perspectivev3 +!default.perspectivev3 + +## Obj-C/Swift specific +*.hmap + +## App packaging +*.ipa +*.dSYM.zip +*.dSYM + +## Playgrounds +timeline.xctimeline +playground.xcworkspace + +# Swift Package Manager +# +# Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. +# Packages/ +# Package.pins +# Package.resolved +# *.xcodeproj +# +# Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata +# hence it is not needed unless you have added a package configuration file to your project +# .swiftpm + +.build/ + +# CocoaPods +# +# We recommend against adding the Pods directory to your .gitignore. However +# you should judge for yourself, the pros and cons are mentioned at: +# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control +# +# Pods/ +# +# Add this line if you want to avoid checking in source code from the Xcode workspace +# *.xcworkspace + +# Carthage +# +# Add this line if you want to avoid checking in source code from Carthage dependencies. +# Carthage/Checkouts + +Carthage/Build/ + +# Accio dependency management +Dependencies/ +.accio/ + +# fastlane +# +# It is recommended to not store the screenshots in the git repo. +# Instead, use fastlane to re-generate the screenshots whenever they are needed. +# For more information about the recommended setup visit: +# https://docs.fastlane.tools/best-practices/source-control/#source-control + +fastlane/report.xml +fastlane/Preview.html +fastlane/screenshots/**/*.png +fastlane/test_output + +# Code Injection +# +# After new code Injection tools there's a generated folder /iOSInjectionProject +# https://github.com/johnno1962/injectionforxcode + +iOSInjectionProject/ diff --git a/Documentation/BigPicture.md b/Documentation/BigPicture.md new file mode 100644 index 000000000..d3b5205e5 --- /dev/null +++ b/Documentation/BigPicture.md @@ -0,0 +1,80 @@ + +# The Big Picture + +* Author: [Michael Ilseman][milseman] + +## Introduction + +I've been finding it helpful to think of our long-term goal as making Swift awesome at string processing, data processing, and "event processing" (working title, suggestions welcome). These are not rigid or clear-cut distinct domains (they actually blend together in extremity) so much as they are 3 interesting "regions" in this design space. Thinking about these regions helps clarify what tasks we're enabling and helps push us towards more general solutions. + +Each of these regions share technical fundamentals, but present novel performance and API design challenges. I hope that keeping the big picture in mind will help guide the design process towards pragmatic trade-offs and robust solutions. + +By "string processing" (at least in the context of this document), I mean processing with the Unicode-rich semantics of Swift's `String` and `Character` types. 
By "data processing", I mean efficient processing done at a binary semantics level, even if such data happens to be viewable as text. By "event processing" (working title, suggestions welcome), I mean being able to detect and respond to patterns over ephemeral "events" issued from an asynchronous source.
+
+We want to be able to compose, layer, and even interweave different kinds of processing together. And, we want these areas to be library-extensible, so that libraries can provide custom behavior through custom protocol conformances. For example, [custom String interpolation](https://github.com/apple/swift-evolution/blob/master/proposals/0228-fix-expressiblebystringinterpolation.md) is extended by libraries for [logging](https://developer.apple.com/documentation/os/logging/generating_log_messages_from_your_code), [sanitizing](https://nshipster.com/expressiblebystringinterpolation/#implementing-a-custom-string-interpolation-type), [templating](https://github.com/ilyapuchka/Interplate), and many other applications. Similarly, there are myriad formulations of pattern matching and we want to enable libraries to provide powerful new abstractions.
+
+## String processing
+
+Swift's `String` is presented as a collection of `Character`s, or [extended grapheme clusters][grapheme-cluster]. Thus, a wildcard match such as a `.` in a regular expression should match a `Character`, `Unicode.Scalar`, or `UInt8` when applied to `String`, `String.UnicodeScalarView`, or `String.UTF8View` respectively:
+
+| Matching "🧟‍♀️" using `.` wildcard | Matched | Remaining content |
+|-------------------------------------|---------|----------------------------|
+| String | 🧟‍♀️ | "" |
+| String.UnicodeScalarView | U+1F9DF | U+200D U+2640 |
+| String.UTF8View | F0 | 9F A7 9F E2 80 8D E2 99 80 |
+
+
+`String` and `Character` comparison honors [Unicode Canonical Equivalence][canonical-equivalence]: `"è"` (U+00E8 Latin Small Letter E with Grave) compares equally to `"e\u{0300}"` (U+0065 Latin Small Letter E, U+0300 Combining Grave Accent). The standard library aims to provide this level of Unicode support while paying the minimal performance cost possible by only performing the more complex analysis when needed.
+
+We aim for string processing to be library-extensible, meaning high level frameworks and platforms can provide linguistically-rich interfaces through protocol conformances. String processors should be composable, so that one can e.g. seamlessly call into Foundation's `FormatStyle`s to do rich, localized or standards-conforming parsing as part of a larger string processing operation.
+
+We also aim to generalize to `Collection` processing. A simple example of this is applying the wildcard `.` to each of String's views above, in effect executing a simple generic pattern over different collections. This ties in naturally with [Collection consumers and searchers](https://forums.swift.org/t/prototype-protocol-powered-generic-trimming-searching-splitting/29415), as collection processors can conform to `CollectionConsumer` enabling them to be used with generic API. This conformance is also a means of composition, as combinators can combine consumers together.
+
+In extremity, when `Element` is trivial ("plain ol' data"), contiguously stored in memory (or at least presented as contiguous chunks in memory), and processing can be done within a moving window over the input, then we are approaching the realm of data processing.
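+A quick way to see the canonical-equivalence and per-view behavior described above is to run it as ordinary Swift; the comments show the expected results:
+
+```swift
+let precomposed = "è"         // U+00E8 LATIN SMALL LETTER E WITH GRAVE
+let decomposed = "e\u{0300}"  // U+0065 followed by U+0300 COMBINING GRAVE ACCENT
+
+precomposed == decomposed          // true: String comparison honors canonical equivalence
+decomposed.count                   // 1: both spellings form a single Character
+precomposed.unicodeScalars.count   // 1
+decomposed.unicodeScalars.count    // 2
+Array(precomposed.utf8).count      // 2 UTF-8 code units (0xC3, 0xA8)
+Array(decomposed.utf8).count       // 3 UTF-8 code units (0x65, 0xCC, 0x80)
+```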
+ + +## Data processing + +Data processing can span from low-level efficient binary deserializers to industrial strength parsers. Many examples that appear to be string processing are fundamentally data processing with a layer of string processing on top. Formats such as JSON, CSV, plists, and even source code are data formats that have a textual presentation when rendered by an editor or terminal. Their stored content may be Unicode-rich, but the format itself should be processed as data. + +For example, imagine processing a CSV document of all defined Unicode scalars. Inside such a document would appear the CSV field-separator `,` followed by `U+0301` (Combining Acute Accent). If we were doing string processing, this would appear to us as a single ([degenerate][degenerates]) grapheme cluster `,́`, that is the comma combined with the following accent into a single grapheme cluster that does not compare equal to either piece. Instead, we want to process CSV at the binary-semantics level where we match field separators literally and everything in-between is opaque content which we present for interpretation by a higher-level layer. + +We want to support composition of data processing with string processing. An example of multi-tiered string-over-data processing is parsing JSON into strongly typed data structures such as `Dictionary`s with `String` keys. Parsing JSON is done at the binary-semantics level, but interpreting the *content* of a JSON field uses `String`'s semantics such that dictionary keys are uniqued under Unicode canonical equivalence. We want to allow tiered string processing to drive the behavior of the data processing layer, such that Unicode-rich analysis of a token can affect parsing (e.g. is this a Unicode-rich identifier or Unicode-rich operator). + +Backtracking during data processing is usually limited or constrained, and processing often happens over a contiguous moving window, making it amenable to processing contiguous chunks derived from an asynchronous source. There may or may not be a notion of position in the input data (e.g. `seek`-able files vs device files). + +API design challenges include squaring the circle of `Sequence` and `AsyncSequence`, how we express such window sizes (e.g. look-ahead) and/or backtracking constraints, designing the core low-level "peek" and "consume" pattern, clarifying buffer ownership and management, and allowing parsers to self-direct their behavior (e.g. error handling and early pruning). + +Performance challenges include avoiding fully-generic evaluation, managing buffers and avoiding intermediary ones, generating compact code comparable in performance to hand-written parsers, and using window size to limit memory usage. + +In extremity, when the chunk size is 1, the `Element` type is ephemeral or otherwise unpersistable, there is no concept of position, and we need to process virtually infinite input within a small time/memory budget, we are approaching the realm of event processing. + +## Event processing + +Event processing is the fuzziest area of this design space (at least for me), as there's not a ton of direct precedent in common Swift code. And yet, much application logic can be thought of as state machines responding live to user input, and critical safety invariants can be thought of as simple logics evaluated over abstract program traces. + +For example, a server process might wish to enforce the invariant that an approval of a user `u` to access some resource requires that `u` first be authenticated. 
An engine can allocate a bit per user in a bitvector and actively monitor an event stream, setting the corresponding bit upon authentication. If an approve happens without the authentication bit being set, the engine invokes custom handling code, which could do anything from logging that tidbit to tearing down the server process and drafting a security advisory. This is scalable and efficient: it scales to virtually-infinite trace histories because it only cares about a single bit-per-user of history: whether `u` was authenticated. + +Implementation techniques involve heavy use of bitvectors and, if events can map to the natural numbers, highly-specialized data structures such as [SparseSets][sparse-set]. + +API design challenges include expressing this mapping to the natural numbers when possible and providing a matching engine API for custom hooks to use (e.g. to retrieve more information about *how* the program got to its current state). Performance challenges include taking advantage of this mapping and compiling rich logics to efficient code. Implementation challenges include sharing the technical infrastructure for generic events without an `Index` type or other notion of position. Event processing shares the same technical fundamentals as string and data processing, but stresses asynchronous API design and directly interacting with the matching engine. + +## Developer experience + +We want to provide a developer experience more akin to parser-combinators: i.e. you're just writing code and calling or composing functions normally. If you need to do something custom, you can just write custom code. We want to provide a build-system experience similar to normal code: there's no extra steps or external tools to invoke. We want to provide a compilation model more akin to parser-generator tools: we provide a large number of constructs (alternation, repetition, etc.) whose semantics are statically-known with little bits of custom user code scattered about. When that custom code is available in the current compilation context, we can even perform (and evolve over time) cross-cutting analysis and optimizations. + +We want powerful functionality presented as normal Swift code, not forced into a particular formalism. In academia, the computational complexity class of a formalism is often the most salient point. That's a *really nice* thing to have and know, but it's usually not even in the top-5 concerns. For example, imagine adding a typo-correction feature to an existing parser for a programming language: surfacing in-scope corrections would be context-sensitive, and furthermore, candidates would be weighted by things such as edit distance or even lexical distance. + + +## Where to go from here + +Most early discussion will be happening in the context of string processing or generic collection processing (with some data parsing thrown in). Since implementation details can quickly become binary compatibility requirements, we'll want to be running ahead to make sure data and event processing works or can be made to work in the future with the API and ABI we ship. We'll want to support asynchronous sources of content so as not to overly fixate on `Collection`, and most notably, `Index` as a representation of position. 
+ +For more musings on implementation strategies, see [Implementation Musings][impl-musings] + +[milseman]: https://github.com/milseman +[sparse-set]: https://github.com/apple/swift-collections/pull/80 +[impl-musings]: ImplementationMusings.md +[grapheme-cluster]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries +[canonical-equivalence]: https://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence +[degenerates]: https://www.unicode.org/reports/tr29/#Rule_Constraints diff --git a/Documentation/DeclarativeStringProcessing.md b/Documentation/DeclarativeStringProcessing.md new file mode 100644 index 000000000..b2a4fa5ec --- /dev/null +++ b/Documentation/DeclarativeStringProcessing.md @@ -0,0 +1,437 @@ +# Declarative String Processing Overview + +## Introduction + +String processing is hard and the current affordances provided by the Swift Standard Library are underpowered. We propose adding two new _declarative_ string processing APIs—a familiar `Regex` literal and a more powerful `Pattern` result builder—to help make Swift string processing fast and easy. + +This is a large feature that will ultimately be divided into multiple Swift Evolution proposals. This initial pitch is intended to prompt discussion about the high level direction and to introduce the key prongs of the feature and their relationship to one another. + +This overview is the work of a number of members of the Swift team (Alex Alonso, Nate Cook, Michael Ilseman, Kyle Macomber, Becca Royal-Gordon, Tim Vermeulen, and Richard Wei) as well as Ishan Bhargava, who implemented a [prototype][ishan] of regular expression literals with strongly-typed captures. + +## Example + +The Swift Standard Library is implementing [native grapheme breaking][grapheme-breaking-pr] for `String`, which requires preprocessing [Unicode data tables][grapheme-break-table]. + +Here's a snippet of the data: + +```txt +# ================================================ + +000A ; LF # Cc + +# Total code points: 1 + +# ================================================ + +0000..0009 ; Control # Cc [10] .. +000B..000C ; Control # Cc [2] .. +000E..001F ; Control # Cc [18] .. +007F..009F ; Control # Cc [33] .. +00AD ; Control # Cf SOFT HYPHEN +061C ; Control # Cf ARABIC LETTER MARK +180E ; Control # Cf MONGOLIAN VOWEL SEPARATOR +200B ; Control # Cf ZERO WIDTH SPACE +200E..200F ; Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK +2028 ; Control # Zl LINE SEPARATOR +2029 ; Control # Zp PARAGRAPH SEPARATOR +202A..202E ; Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE +``` + +Each relevant line is of the form: + +```txt +0000..10FFFF ; Property # Comment +``` + +- The first column (delimited by the `;`) is a hex number or range of hex numbers that represent Unicode scalar values. +- The second column is the grapheme break property that applies to this range of scalars. +- Everything after the `#` is a comment. +- Entries are separated by newlines. + +This is a very simple data format to process, and when we try to do so we quickly see inadequacies with the status quo. + +### Naive handwritten parser + +A straight-forward approach to tackling this problem is to use the standard library's generic collection algorithms like `split`, `map`, and `filter`. 
+
+```swift
+extension Unicode.Scalar {
+  // Try to convert a hexadecimal string to a scalar
+  init?<S: StringProtocol>(hex: S) {
+    guard let val = UInt32(hex, radix: 16), let scalar = Self(val) else {
+      return nil
+    }
+    self = scalar
+  }
+}
+
+func graphemeBreakPropertyData(
+  forLine line: String
+) -> (scalars: ClosedRange<Unicode.Scalar>, property: Unicode.GraphemeBreakProperty)? {
+  let components = line.split(separator: ";")
+  guard components.count >= 2 else { return nil }
+
+  let splitProperty = components[1].split(separator: "#")
+  let filteredProperty = splitProperty[0].filter { !$0.isWhitespace }
+  guard let property = Unicode.GraphemeBreakProperty(filteredProperty) else {
+    return nil
+  }
+
+  let scalars: ClosedRange<Unicode.Scalar>
+  let filteredScalars = components[0].filter { !$0.isWhitespace }
+  if filteredScalars.contains(".") {
+    let range = filteredScalars
+      .split(separator: ".")
+      .map { Unicode.Scalar(hex: $0)! }
+    scalars = range[0] ... range[1]
+  } else {
+    let scalar = Unicode.Scalar(hex: filteredScalars)!
+    scalars = scalar ... scalar
+  }
+  return (scalars, property)
+}
+```
+
+This code gets the job done, but it suffers in readability, maintainability, and scalability.
+
+- It is difficult to read and understand quickly; one has to mentally process the line multiple times.
+- Hardcoded subscripts, force unwraps, etc., are fragile to changes in the format or the script itself.
+- This does multiple passes over the input, allocating multiple temporary data structures in the process.
+
+Ideally, we'd process this string the same way we read the file: from left to right.
+
+### Single-pass handwritten parser
+
+By following a [consumer pattern][consumers], we can extract the relevant information in a single pass over the input.
+
+```swift
+// ... consumer helpers like `eat(exactly:)`, `eat(while:)`, and `peek()` ...
+
+// Try to parse a Unicode scalar off the input
+private func parseScalar(_ str: inout Substring) -> Unicode.Scalar? {
+  let val = str.eat(while: { $0.isHexDigit })
+  guard !val.isEmpty else { return nil }
+
+  // Subtle potential bug: if this init fails, we need to restore
+  // str.startIndex. Because of how this is currently called, the bug won't
+  // manifest now, but could if the call site is changed.
+  return Unicode.Scalar(hex: val)
+}
+
+func graphemeBreakPropertyData(
+  forLine line: String
+) -> (scalars: ClosedRange<Unicode.Scalar>, property: Unicode.GraphemeBreakProperty)? {
+  var line = line[...]
+  guard let lower = parseScalar(&line) else {
+    // Comment or whitespace line
+    return nil
+  }
+
+  let upper: Unicode.Scalar
+  if line.peek(".") {
+    guard !line.eat(exactly: "..").isEmpty else {
+      fatalError("Parse error: invalid scalar range")
+    }
+    guard let s = parseScalar(&line) else {
+      fatalError("Parse error: expected scalar upperbound")
+    }
+    upper = s
+  } else {
+    upper = lower
+  }
+
+  line.eat(while: { !$0.isLetter })
+  let name = line.eat(while: { $0.isLetter || $0 == "_" })
+  guard let prop = Unicode.GraphemeBreakProperty(name) else {
+    return nil
+  }
+
+  return (lower ... upper, prop)
+}
+```
+
+This implementation is more scalable and maintainable, but at the cost of approachability.
+
+- It executes in a single pass over the input, without intermediary allocations.
+- Buried assumptions in the naive code are explicit failures here.
+- But, this consumer pattern is very low-level and using it well requires care and expertise. For example, backtracking has to be manually handled and reasoned about, as unnecessary backtracking quickly saps performance.
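+The consumer helpers referenced in the comment above (`eat(exactly:)`, `eat(while:)`, and `peek()`) come from the linked consumer-pattern prototype and aren't spelled out in this overview. One minimal way they could be written over `Substring` (a sketch for illustration; the prototype's actual helpers may differ):
+
+```swift
+extension Substring {
+  /// Consume and return the longest prefix whose characters satisfy `predicate`.
+  mutating func eat(while predicate: (Character) -> Bool) -> Substring {
+    let end = firstIndex(where: { !predicate($0) }) ?? endIndex
+    defer { self = self[end...] }
+    return self[..<end]
+  }
+
+  /// Consume `prefix` if present; returns the consumed slice, empty on mismatch.
+  mutating func eat(exactly prefix: String) -> Substring {
+    guard hasPrefix(prefix) else { return self[startIndex..<startIndex] }
+    let end = index(startIndex, offsetBy: prefix.count)
+    defer { self = self[end...] }
+    return self[..<end]
+  }
+
+  /// Report whether the next character is `c`, without consuming it.
+  func peek(_ c: Character) -> Bool { first == c }
+}
+```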
+
+## Proposed Solution
+
+Declarative APIs for string processing have the potential to be approachable, maintainable, _and_ scalable.
+
+### Regular Expressions
+
+A commonly used tool for this kind of pattern matching and data extraction is [regular expressions][regex-wikipedia].
+
+Consider these two lines:
+
+```txt
+007F..009F ; Control # Cc [33] ..
+00AD ; Control # Cf SOFT HYPHEN
+```
+
+We can match them and extract the data using the regular expression:
+
+```re
+/([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s(\w+).*/
+```
+
+Let's break it down:
+
+- `([0-9A-F]+)` matches one or more hex digits, capturing the first scalar
+- `(?:\.\.([0-9A-F]+))?` optionally matches the `..` and captures the second scalar
+- `\s*;\s` matches zero or more whitespace characters, a semicolon, and a single whitespace character
+- `(\w+)` matches one or more word characters, capturing the grapheme break property
+- `.*` matches zero or more of any character (the rest of the line)
+
+We propose adding a new regular expression literal, with strongly typed captures, to Swift. Using `Regex`, we can re-implement `graphemeBreakPropertyData` like so:
+
+```swift
+func graphemeBreakPropertyData(
+  forLine line: String
+) -> (scalars: ClosedRange<Unicode.Scalar>, property: Unicode.GraphemeBreakProperty)? {
+  line
+    .match(/([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s(\w+).*/)?
+    .captures.flatMap { (l, u, p) in
+      guard let property = Unicode.GraphemeBreakProperty(p) else {
+        return nil
+      }
+      let scalars = Unicode.Scalar(hex: l)! ... Unicode.Scalar(hex: u ?? l)!
+      return (scalars, property)
+    }
+}
+```
+
+This code reads from left to right and doesn't require any hard-coded indices. `Regex` is generic over its captures, which the compiler infers from the capturing groups in the literal:
+
+```swift
+let regex = /([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s(\w+).*/
+print(type(of: regex))
+// Prints Regex<(Substring, Substring?, Substring)>
+```
+
+> ***Note**: The type of the second capture, the end of the scalar range, is optional. This is because the corresponding capture group in the regex is optional. (If the capturing group was repeated using a `*` or `+` quantifier, it would correspond to a lazy collection.)*
+
+Strongly typed captures make it more convenient and safer to post-process match results, e.g. enabling the use of tuple destructuring and the nil-coalescing operator in `graphemeBreakPropertyData`.
+
+Regular expressions are compact, powerful, and often fast. Their syntax is _familiar_ and _ubiquitous_. Though regexes come in different flavors, knowledge and patterns acquired in one tool (e.g. Perl) can often be applied in another (e.g. Xcode). An important goal of adding regular expressions to Swift is to facilitate this kind of reuse.
+
+### From `Regex` to `Pattern`
+
+Regular expressions originated for use in Unix command-line arguments and text editor search fields because they are very terse, representing in a single line what otherwise would be an entire program. Due to this lineage, regular expression syntax has a few disadvantages, especially when used within a general-purpose programming language:
+
+- The terse syntax can be hard to remember (does `\w` mean "word" or "whitespace"?) and difficult to read (the `..` and `;` in the example regex are obfuscated by the metacharacters).
+- The lack of library support encourages reinventing the wheel—simplistic regexes are commonly misused to parse deceptively complex formats like dates and currency, when rich libraries of sophisticated parsers already exist (e.g. Foundation's `FormatStyle`).
+- Regular expressions occupy an awkward middle ground of being too powerful to compose together, but not powerful enough to recognize recursive structures (contrast with [PEGs][peg]). Extensions such as back-references catapult matching to being [NP-complete][regex-np], yet they still cannot be used to write parsers. + +Swift prizes clarity over terseness. Regular expressions are great for simple matching, but as they grow in complexity we want to be able to bring the full power of Swift and its libraries to bear. + +### `Pattern` Builder + +The downsides of regular expressions motivate a more versatile result builder syntax for declaring a `Pattern`: + +```swift +func graphemeBreakPropertyData( + forLine line: String +) -> (scalars: ClosedRange, property: Unicode.GraphemeBreakProperty)? { + line.match { + OneOrMore(.hexDigit).capture { Unicode.Scalar(hex: $0) } + + Optionally { + ".." + OneOrMore(.hexDigit).capture { Unicode.Scalar(hex: $0) } + } + + OneOrMore(.whitespace) + ";" + OneOrMore(.whitespace) + + OneOrMore(.word).capture(GraphemeBreakProperty.init) + + Repeat(.anyCharacter) + }?.captures.map { (lower, upper, property) in + let scalars = lower ... (upper ?? lower) + return (scalars, property) + } +} +``` + +- Character classes and quantifiers are spelled out, making them more readable and discoverable via code completion. +- String literals make punctuation matching simple and clear: the two dots and the semicolon are critical parts of our format and they stand out inside literals. +- Capture groups can be processed inline, improving locality and strong typing. Compare `Pattern<(Unicode.Scalar, Unicode.Scalar?, GraphemeBreakProperty)>` vs. `Regex<(Substring, Substring?, Substring)>`. *(Inferring the generic argument of `Pattern` from the capturing groups in the result builder will require language improvements.)* +- Failure to construct a `Unicode.Scalar` or `GraphemeBreakProperty` will exit matching early, just like in consumer-pattern code. + +Sophisticated features like inline capture group processing feel right at home with the result builder syntax _because it’s all just regular Swift code_—it isn't nearly as natural to try to force this kind of functionality into the regex literal. + +Consider the last capture group that uses `GraphemeBreakProperty.init`. `GraphemeBreakProperty` is defined as: + +```swift +enum GraphemeBreakProperty: UInt32 { + case control = 0 + case extend = 1 + case prepend = 2 + case spacingMark = 3 + case extendedPictographic = 4 + + init?(_ str: String) { + switch str { + case "Extend": + self = .extend + case "Control", "CR", "LF": + self = .control + case "Prepend": + self = .prepend + case "SpacingMark": + self = .spacingMark + case "Extended_Pictographic": + self = .extendedPictographic + default: + return nil + } + } +} +``` + +If `GraphemeBreakProperty.init` returns `nil`, the match fails. This is really convenient, since the table includes property names we want to ignore. To get the same level of checking with a traditional regex, we would have had to duplicate all the property names into an alternation. Since capture processing participates in pattern matching (i.e. it can signal a match failure), it can be used to prune the search space early, which is an advantage over post-processing the results of a traditional regex. + +This kind of composition is incredibly powerful. `Pattern` supports the interpolation of a wide variety of sophisticated existing parsers, like [Foundation's `FormatStyle`s][format-style]. 
+ +Consider parsing an HTTP header: + +```http +HTTP/1.1 200 OK +Connection: close +Proxy-Connection: close +Via: HTTP/1.1 localhost (IBM-PROXY-WTE) +Date: Thu, 02 Sep 2021 18:05:45 GMT +Server: Apache +X-Frame-Options: SAMEORIGIN +Strict-Transport-Security: max-age=15768000 +Last-Modified: Thu, 02 Sep 2021 17:54:18 GMT +Accept-Ranges: bytes +Content-Length: 6583 +Content-Type: text/html; charset=UTF-8 +``` + +We can extract the HTTP status code, date, and content type with the following pattern: + +```swift +let match = header.match { + Group { + "HTTP/" + Double.FormatStyle() + Int.FormatStyle().capture() + OneOrMore(.letter) + Newline() + } + .skipWhitespace + + Repeating { + Alternation { + Group { + "Date: " + Date.FormatStyle.rfc1123.capture { HTTPHeaderField.date($0) } + Newline() + } + Group { + "Content-Type: " + MimeType.FormatStyle().capture { HTTPHeaderField.contentType($0) } + Newline() + } + Group { + /[-\w]+: .*/ + Newline() + } + } + } +} +.caseInsensitive + +print(type(of: match)) +// Prints (Int, [HTTPHeaderField])? +``` + +### Do we want _both_ `Pattern` and `Regex`? + +Yes! + +`Pattern` uses a more versatile syntax (just regular Swift code!) and supports matching more complex languages than `Regex`. But `Pattern` can't compete with the familiarity and ubiquity of traditional regular expression syntax. `Regex` literals work especially well in conjunction with API such as collection algorithms presented below. + +We think `Pattern` and `Regex` can complement one another by: + +- Allowing the use of `Regex` literals within `Pattern` builders, alongside a rich library of off the shelf parsers. This will let folks choose succinct expressions when they want, but still nudge them towards more powerful and general constructs. +- Adding a refactor action to transform `Regex` literals into `Pattern` builders. This allows rapid prototyping using `Regex` literals with an easy off-ramp to something more maintainable and powerful. + +### Collection Algorithms + +We intended to extend and add generic [consumer and searcher][consumer-searcher] algorithms to the standard library for operating over collections using patterns or regexes. + +Consider `contains`, which today can only check for the presence of a single `Element`: + +```swift +let str = "Hello, World!" +str.contains("Hello") // error: cannot convert value of type 'String' to expected argument type 'String.Element' (aka 'Character') +``` + +As part of this effort, we'll be adding a variant of `contains` that invokes a "searcher" of the same element type: + +```swift +// The below are all equivalent +str.contains("Hello") || str.contains("Goodbye") +str.contains(/Hello|Goodbye/) +str.contains { + Alternation { + "Hello" + "Goodbye" + } +} +``` + +The kinds of algorithms that can be added or enhanced by consumers and searchers include: +- `firstRange(of:)`, `lastRange(of:)`, `allRanges(of:)`, `contains(_:)` +- `split(separator:)` +- `trim(_:)`, `trimPrefix(_:)`, `trimSuffix(_:)` +- `replaceAll(_:with:)`, `removeAll(_:)`, `moveAll(_:to:)` +- `match(_:)`, `allMatches(_:)` + +## Future Work + +The Swift operator `~=` allows libraries to extend syntactic pattern matching by returning whether matching succeeded or not. An [enhancement to this][syntax] would allow libraries to produce a result as part of a _destructuring_ pattern match, allowing patterns and regexes to be used inside `case` syntax and directly bind their captures to variables. 
+ +```swift +func parseField(_ field: String) -> ParsedField { + switch field { + case let text <- /#\s?(.*)/: + return .comment(text) + case let (l, u) <- /([0-9A-F]+)(?:\.\.([0-9A-F]+))?/: + return .scalars(Unicode.Scalar(hex: l) ... Unicode.Scalar(hex: u ?? l)) + case let prop <- GraphemeBreakProperty.init: + return .property(prop) + } +} +``` + +## The "Big Picture" + +"String processing" as presented above is touching on something broader: processing content that might span from simple binary data (`UInt8`) to semantics-rich entities (`Character` or even generic over `Equatable`). Such content may be readily available for fully-synchronous processing, derived in contiguous chunks from an asynchronous source, or we may even be reacting live to some incoming stream of ephemeral content from a fully asynchronous source. + +The "big picture" is a complex multi-dimensional (and non-orthogonal!) design space that we need some way to talk and reason about to make progress. Swift is source- and ABI-stable, meaning decisions we make now can positively or negatively impact the ability of Swift to meet future needs. Thinking ahead about different areas of this design space can help us avoid painting ourselves into a corner and can help guide us toward more general, broadly-useful approaches. + +For musings on this topic, see [The Big Picture][big-picture]. + + +[grapheme-breaking-pr]: https://github.com/apple/swift/pull/37864 +[ucd-processing-script]: https://github.com/apple/swift/pull/37864/files#diff-d3587c4a489cae08c4d8a8fb38379a7b74198a07c9195d6d1f7c0c1cc639dd4e +[grapheme-break-table]: http://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt +[regex-wikipedia]: https://en.wikipedia.org/wiki/Regular_expression +[big-picture]: BigPicture.md +[consumers]: https://github.com/apple/swift-evolution-staging/blob/976ea3a81813f06ec11f00550d4e83f340cf2f7e/Sources/CollectionConsumerSearcher/Eat.swift +[formal-language]: https://en.wikipedia.org/wiki/Formal_language +[consumer-searcher]: https://forums.swift.org/t/prototype-protocol-powered-generic-trimming-searching-splitting/29415 +[regular-language]: https://en.wikipedia.org/wiki/Regular_language +[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar +[regex-np]: https://perl.plover.com/NPC/NPC-3SAT.html +[syntax]: https://gist.github.com/milseman/bb39ef7f170641ae52c13600a512782f#pattern-matching-through-conformance-to-pattern +[format-style]: https://developers.apple.com/videos/play/wwdc2021/10109/ +[ishan]: https://github.com/ishantheperson/swift/tree/ishan/regex diff --git a/Documentation/ImplementationMusings.md b/Documentation/ImplementationMusings.md new file mode 100644 index 000000000..c06cef933 --- /dev/null +++ b/Documentation/ImplementationMusings.md @@ -0,0 +1,107 @@ + +# Implementation Musings + +* Author: [Michael Ilseman][milseman] + +## General-purpose matching engine + +Academically, these seemingly different domains are unified by mapping to [automata](https://en.wikipedia.org/wiki/Automata_theory). For us, we unify using a general-purpose matching engine. The matching engine is fully deterministic, so compilation of non-deterministic or ambiguous languages (such as regular expressions) involves picking an execution strategy or strategies. + +### Static compilation + +The most likely implementation strategy would be to statically compile (or partially compile) matching programs into a low-level bytecode. 
At run time, the bytecode is loaded, linked together (Swift supports separate compilation), and interpreted by a general-purpose matching engine. We'd likely want compile-time analysis and optimization as well as some limited form of run-time analysis and optimization for matching programs. + +A bytecode geared towards pattern matching can be significantly more compact than the corresponding executable machine code, especially when many applications on a system can share the same interpreter code. For some matching programs, especially the simpler or more performance-critical ones, statically compiling all the way to machine code is worthwhile. + +### Instruction Set + +ISA design can range from a small number of primitive operations which complex operations decompose to (RISC), to having complex operations encoded in a single instructions (CISC). We will likely want... a bit of both. + +Primitive operations are more general, in that novel operations can be added by decomposing to the primitive operations. These give us a better avenue for supporting unanticipated needs, such as by libraries or new features we add and would like to backport. + +Complex instructions aid efficiency, as we need to do fewer fetch-decode-execute iterations, and can avoid needing to store or refer to intermediary results. + +For example, `a*` can be represented many ways: + +``` +# e0 contains element "a" + +# Match "a" as many times as we can before falling through + + repeat e0 + +# Do the same with control flow and save points. `saveAddress` will resume execution +# from the given label upon match fail without touching the current position + + saveAddress(DONE) +START: + consume e0 + goto START +DONE: + ... + +# Do the same with just control flow, but doubly-inspecting the input (once for peek and once for advance) + +START: + peek b0, e0 + cond-branch !b0, DONE + advance + goto START +DONE: + ... + +``` + +We also will want the ability to call custom hooks, e.g. for composing with library calls. So, a matching program may include e.g. a list of elements to compare against and a list of closures to invoke. We'll want to version the bytecode and we can backport new operations via decomposition to primitives (also a good way to test and benchmark new operations). + +### Generalizing operations + +A generalization of a character class is `(Element) -> Bool` (throwing might be useful to signal a matching abort in contrast to a failure). + +A generalization of a match/consume operation is `(Input, Position) -> Position?`, for which assertions can be viewed as a special case. This requires having a notion of position and having bound a type for the input (e.g. some instance of `Collection`). For collections, this is basically the interface of `CollectionConsumer`, so it is also a means of composition, separate compilation, and a vector for backporting. + +A different generalization of match/consume is `(inout MatchingEngine) -> Bool`, where the engine has API for consuming, advancing, querying information about how it got there, and even interacting with save points used for backtracking. This works well as a generalization for monitors, which have no notion of position and which may want to query matching state. This has the downside of stabilizing an API for the engine. + +Of course, whenever possible we'd prefer to compile or link in bytecode for such things, but calling out to arbitrary Swift code is behavior we want to allow. 
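+The shapes sketched in this section, written out as Swift declarations (the names here are hypothetical and nothing below is settled API), would look roughly like:
+
+```swift
+/// A generalized "character class": does this element belong?
+/// Throwing can signal a matching abort, in contrast to a plain failure.
+typealias ElementPredicate<Element> = (Element) throws -> Bool
+
+/// A generalized match/consume operation: return the position after the match,
+/// or nil to signal a match failure at this position.
+typealias ConsumeFunction<Input, Position> = (Input, Position) -> Position?
+
+/// An assertion can be viewed as a consume function that never advances.
+func assertion<Input, Position>(
+  _ holds: @escaping (Input, Position) -> Bool
+) -> ConsumeFunction<Input, Position> {
+  { input, position in holds(input, position) ? position : nil }
+}
+```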
+
+## Frontends
+
+For string processing, this repo contains an example regex parser, compiler, and a few execution strategies. The regex parser itself serves as an example of writing parsers in Swift for simple languages. It hasn't (yet) been ported to run on the MatchingEngine, and doing so requires picking an execution strategy or strategies. Its AST can serve as the start of a simple result-builder API (tbd).
+
+For generic parsing, this repo contains an implementation of generic PEGs ("parsing expression grammars"). It is **not** a goal to ship PEGs specifically in Swift, but it **is** a goal to empower libraries providing parser formalisms such as PEGs. In the meantime, PEGs serve as a drop-in API for specifying parsers and can help stress the performance of generic code paths.
+
+We haven't added a representative example yet of low-level binary deserialization-like operations. We also haven't added a representative example of "bottoms-up" processing (e.g. with a cost function to select the best candidate).
+
+For event processing, this repo contains a formulation (but not yet an implementation) of PTCaRet ("past time linear temporal logic with call and return"). It's a formal logic variant that's a little better suited for software systems than some others, as it's oriented around making statements about *how* the program got to its current state. "Call" and "return" are not (necessarily, but could be!) Swift function calls and returns, so much as a way of abstracting parts of a program trace, similar to procedural abstraction in code. It is **not** a goal to ship this particular logic in Swift, but it **is** a goal to empower libraries to provide this kind of capability. In the meantime, it stresses our async story, our ability to scale to virtually infinite histories, and how to enable custom code to react to and participate in pattern matching.
+
+We haven't added any particular parser combinator library or approach yet.
+
+
+### Enhanced syntactic pattern matching over sets and collections
+
+Functional programming languages often support simple structural pattern matching against the head/tail of a list under a `cons`-like operation.
+
+One interesting bit of obscure functionality is so-called "ACI" matching (associativity, commutativity, and identity). For example, imagine matching against an Array using its associative `+` operator with identity `[]`:
+
+```swift
+// Strawperson syntax
+switch myArray {
+case (let prefix) + [3, 4] + (let suffix):
+```
+
+which would successfully match the array `[1, 2, 3, 4, 5, 6]` and assign the `ArraySlice` `[1, 2]` to `prefix` and `[5, 6]` to `suffix`. Prefix and suffix can be empty, hence "identity" matching.
+
+Similarly, `OptionSet`'s `|` operator is associative and commutative with identity `[]`:
+
+```swift
+// Strawperson syntax
+switch myOptionSet {
+case .specificValue | (let theRest):
+```
+
+which would match any set with `.specificValue` in it, binding everything else in the set to `theRest` (which again can be empty).
+
+When there are multiple terms to match, some of which might just be variables and thus not available statically, this kind of matching is [surprisingly complex][aci], though often fast in common cases. Compiling to the MatchingEngine could be an interesting future direction for the language.
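+To make the intended semantics of the array strawperson concrete, here is roughly what such a match would bind, written as a plain function in today's Swift (illustrative only; a real implementation compiled to the MatchingEngine would not look like this):
+
+```swift
+/// Splits `array` around the first occurrence of `infix`, binding what the
+/// strawperson `(let prefix) + [3, 4] + (let suffix)` pattern would bind.
+/// Simplified: requires a non-empty literal part.
+func splitAround<T: Equatable>(
+  _ array: [T], infix: [T]
+) -> (prefix: ArraySlice<T>, suffix: ArraySlice<T>)? {
+  guard !infix.isEmpty, infix.count <= array.count else { return nil }
+  for start in 0...(array.count - infix.count) {
+    if array[start ..< start + infix.count].elementsEqual(infix) {
+      return (array[..<start], array[(start + infix.count)...])
+    }
+  }
+  return nil
+}
+
+let (prefix, suffix) = splitAround([1, 2, 3, 4, 5, 6], infix: [3, 4])!
+// prefix == [1, 2], suffix == [5, 6]; either slice may be empty.
+```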
+ +[milseman]: https://github.com/milseman +[aci]: https://www.sciencedirect.com/science/article/pii/S0747717187800275 diff --git a/Package.swift b/Package.swift new file mode 100644 index 000000000..4b3ea16dc --- /dev/null +++ b/Package.swift @@ -0,0 +1,59 @@ +// swift-tools-version:5.3 +// The swift-tools-version declares the minimum version of Swift required to build this package. + +import PackageDescription + +let package = Package( + name: "swift-experimental-string-processing", + products: [ + // Products define the executables and libraries produced by a package, and make them visible to other packages. + .library( + name: "Regex", + targets: ["Regex"]), + .library( + name: "PEG", + targets: ["PEG"]), + .library( + name: "MatchingEngine", + targets: ["MatchingEngine"]), + ], + dependencies: [ + // Dependencies declare other packages that this package depends on. + // .package(url: /* package url */, from: "1.0.0"), + ], + targets: [ + // Targets are the basic building blocks of a package. A target can define a module or a test suite. + // Targets can depend on other targets in this package, and on products in packages which this package depends on. + .target( + name: "Util", + dependencies: []), + .testTarget( + name: "UtilTests", + dependencies: ["Util"]), + .target( + name: "MatchingEngine", + dependencies: ["Util"]), + .testTarget( + name: "MatchingEngineTests", + dependencies: ["MatchingEngine"]), + .target( + name: "Regex", + dependencies: ["Util", "MatchingEngine"]), + .testTarget( + name: "RegexTests", + dependencies: ["Regex"]), + .target( + name: "PEG", + dependencies: ["Util", "MatchingEngine"]), + .testTarget( + name: "PEGTests", + dependencies: ["PEG", "Util"]), + .target( + name: "PTCaRet", + dependencies: ["Util", "MatchingEngine"]), + .testTarget( + name: "PTCaRetTests", + dependencies: ["PTCaRet", "Util"]), + ] +) + diff --git a/README.md b/README.md index 3e4e72a3a..e49a13ab4 100644 --- a/README.md +++ b/README.md @@ -1 +1,8 @@ -# swift-experimental-string-processing \ No newline at end of file +# MatchingEngine + +An early experimental general-purpose pattern matching engine for Swift. + +See [Declaratic String Processing Overview][decl-string] + +[decl-string]: Documentation/DeclarativeStringProcessing.md + diff --git a/Sources/Combinators/Combinators.swift b/Sources/Combinators/Combinators.swift new file mode 100644 index 000000000..b312a9b54 --- /dev/null +++ b/Sources/Combinators/Combinators.swift @@ -0,0 +1,219 @@ +import MatchingEngine + +/* + + Inspired by "Staged Parser Combinators for Efficient Data Processing" by Jonnalagedda et al. + + */ + +import Util + +// Stages are represented as nested namespaces that bind generic +// types +public enum Combinators { + public enum BindElement { + public enum BindPosition { + // TODO: it's not clear if error is bound earlier or + // later than the collection... + public enum BindError { + } + + public enum BindInput + where Input.Element == Element, Input.Index == Position { + + } + } + } +} + +extension Combinators.BindElement.BindPosition.BindError { + public struct ParseResult: Hashable { + var next: Position + var result: Result + } + + public struct Parser { + let apply: (Position) throws -> ParseResult + + public init(_ f: @escaping (Position) throws -> ParseResult) { + self.apply = f + } + } +} + +// Helpers +extension + Combinators.BindElement.BindPosition.BindError.ParseResult +{ + public typealias ParseResult = Combinators.BindElement.BindPosition.BindError.ParseResult + + public var value: T? 
{ + switch result { + case .failure(_): return nil + case .success(let v): return v + } + } + + public var isError: Bool { value == nil } + + // Paper does this, not sure if we want to distinguish + // successful empty parses from error parses, and if + // we want error recovery to involve skipping... + public var isEmpty: Bool { isError } + + public func mapValue(_ f: (T) -> U) -> ParseResult { + ParseResult(next: next, result: result.map(f)) + } + + public func flatMap( + _ f: (Position, T) throws -> ParseResult + ) rethrows -> ParseResult { + switch result { + case .success(let v): + return try f(next, v) + case .failure(let e): + return ParseResult(next: next, result: .failure(e)) + } + } + + public var successSelf: Self? { + guard !isError else { return nil } + return self + } + + public var errorSelf: Self? { + guard isError else { return nil } + return self + } +} + + +// Combinators +extension Combinators.BindElement.BindPosition.BindError.Parser { + typealias Parser = Combinators.BindElement.BindPosition.BindError.Parser + typealias ParseResult = Combinators.BindElement.BindPosition.BindError.ParseResult + + // Backtracking alternation + public func or(_ rhs: Self) -> Self { + Self { pos in + try self.apply(pos).successSelf ?? rhs.apply(pos) + } + } + + public func map( + _ f: @escaping (T) -> U + ) -> Parser { + Parser { pos in + try self.apply(pos).mapValue(f) + } + } + + public func flatMap( + _ f: @escaping (T) -> Parser + ) -> Parser { + return Parser { pos in + try self.apply(pos).flatMap { (p, v) in + try f(v).apply(p) + } + } + } + + public func chain( + _ rhs: Parser, combining f: @escaping (T, U) -> V + ) -> Parser { + Parser { pos in + try self.apply(pos).flatMap { p, t in + try rhs.apply(p).mapValue { u in f(t, u) } + } + } + } + + + public func chain( + _ rhs: Parser + ) -> Parser> { + self.chain(rhs) { Pair($0, $1) } + } + + public func chainLeft( + _ rhs: Parser + ) -> Parser { + self.chain(rhs) { r, _ in r } + } + public func chainRight( + _ rhs: Parser + ) -> Parser { + self.chain(rhs) { _, r in r } + } + + public var `repeat`: Parser<[T]> { + // TODO: non-primitive construction + Parser { pos in + var pos = pos + var result = Array() + while let intr = try self.apply(pos).successSelf { + pos = intr.next + result.append(intr.value!) 
+ } + return ParseResult(next: pos, result: .success(result)) + } + } + + public func `repeat`(exactly n: Int) -> Parser<[T]> { + // TODO: non-primitive construction + Parser { pos in + var pos = pos + var result = Array() + for _ in 0..: Hashable { + var first: T + var second: U + + init(_ t: T, _ u: U) { + self.first = t + self.second = u + } +} + + +/* + + Extract HTTP response body + + def status = ( + ("HTTP/" ~ decimalNumber) ~> wholeNumber <~ (text ~ crlf) + ) map (_.toInt) + + def headers = rep(header) + + def header = (headerName <~ ":") flatMap { key => + (valueParser(key) <~ crlf) map { value => (key, value) } + } + + def valueParser(key: String) = + if (key == "Content-Length") wholeNumber else text + + def body(i: Int) = repN(anyChar, i) <~ crlf + + def response = (status ~ headers <~ crlf) map { + case st ~ hs => Response(st, hs) + } + + def respWithPayload = response flatMap { r => + body(r.contentLength) + } + + */ diff --git a/Sources/MatchingEngine/Builder.swift b/Sources/MatchingEngine/Builder.swift new file mode 100644 index 000000000..ea6d1f37b --- /dev/null +++ b/Sources/MatchingEngine/Builder.swift @@ -0,0 +1,158 @@ +import Util + +extension Program where Element: Hashable { + public struct Builder { + var instructions = Array() + + var elements = TypedSetVector() + var strings = TypedSetVector() + + // Map tokens to actual addresses + var addressTokens = Array() + var addressFixups = Array<(InstructionAddress, AddressToken)>() + + // Registers + var nextBoolRegister = BoolRegister(0) + + public init() {} + } +} + +extension Program.Builder { + public init(staticElements: S) where S.Element == Element { + staticElements.forEach { elements.store($0) } + } + + public mutating func buildNop(_ r: StringRegister? = nil) { + instructions.append(.nop(r)) + } + public mutating func buildNop(_ s: String) { + buildNop(strings.store(s)) + } + + public mutating func buildBranch(to t: AddressToken) { + instructions.append(.branch()) + fixup(to: t) + } + public mutating func buildCondBranch( + _ condition: BoolRegister, to t: AddressToken + ) { + instructions.append(.condBranch(condition: condition)) + fixup(to: t) + } + + public mutating func buildSave(_ t: AddressToken) { + instructions.append(.save()) + fixup(to: t) + } + public mutating func buildSaveAddress(_ t: AddressToken) { + instructions.append(.saveAddress()) + fixup(to: t) + } + + public mutating func buildClear() { + instructions.append(.clear()) + } + public mutating func buildRestore() { + instructions.append(.restore()) + } + public mutating func buildFail() { + instructions.append(.fail()) + } + public mutating func buildCall(_ t: AddressToken) { + instructions.append(.call()) + fixup(to: t) + } + public mutating func buildRet() { + instructions.append(.ret()) + } + + public mutating func buildAbort(_ s: StringRegister? 
= nil) { + instructions.append(.abort(s)) + } + public mutating func buildAbort(_ s: String) { + buildAbort(strings.store(s)) + } + + public mutating func buildConsume(_ n: Distance) { + instructions.append(.consume(n)) + } + + public mutating func buildMatch(_ e: Element) { + instructions.append(.match(elements.store(e))) + } + + public mutating func buildAssert(_ e: Element, into c: BoolRegister) { + instructions.append(.assertion(condition: c, elements.store(e))) + } + + public mutating func buildAccept() { + instructions.append(.accept()) + } + + public mutating func buildPrint(_ s: StringRegister) { + instructions.append(.print(s)) + } + + public func assemble() -> Program { + // Do a pass to map address tokens to addresses + var instructions = instructions + for (instAddr, tok) in addressFixups { + instructions[instAddr.rawValue].operand.initializePayload( + addressTokens[tok.rawValue]! + ) + } + + var regInfo = Program.RegisterInfo() + regInfo.elements = elements.count + regInfo.strings = strings.count + regInfo.bools = nextBoolRegister.rawValue + + return Program( + instructions: InstructionList(instructions), + staticElements: elements.stored, + staticStrings: strings.stored, + registerInfo: regInfo) + } + + public mutating func reset() { self = Self() } +} + +// Address-agnostic interfaces for label-like support +extension Program.Builder { + public enum _AddressToken {} + public typealias AddressToken = TypedInt<_AddressToken> + + public mutating func createAddress() -> AddressToken { + defer { addressTokens.append(nil) } + return AddressToken(addressTokens.count) + } + + // Resolves the address token to the most recently added + // instruction, updating prior and future address references + public mutating func resolve(_ t: AddressToken) { + assert(!instructions.isEmpty) + assert(addressTokens[t.rawValue] == nil) + + addressTokens[t.rawValue] = + InstructionAddress(instructions.count &- 1) + } + + // Associate the most recently added instruction with + // the provided token, ensuring it is fixed up during + // assembly + public mutating func fixup(to t: AddressToken) { + assert(!instructions.isEmpty) + addressFixups.append( + (InstructionAddress(instructions.endIndex-1), t)) + } +} + +// Register helpers +extension Program.Builder { + public mutating func createRegister() -> BoolRegister { + defer { nextBoolRegister.rawValue += 1 } + return nextBoolRegister + } +} + diff --git a/Sources/MatchingEngine/Consume.swift b/Sources/MatchingEngine/Consume.swift new file mode 100644 index 000000000..da9d25a29 --- /dev/null +++ b/Sources/MatchingEngine/Consume.swift @@ -0,0 +1,36 @@ +var checkComments = true + +extension Engine { + func createProcessor(_ input: Input) -> Processor { + Processor(program, input, enableTracing: enableTracing) + } + + public func consume(_ input: Input) -> Input.Index? { + if enableTracing { + print("Consume: \(input)") + } + + var cpu = createProcessor(input) + let result: Input.Index? = { + while true { + switch cpu.state { + case .accept: + return cpu.currentPosition + case .fail: + return nil + case .inprogress: cpu.cycle() + } + } + }() + + if enableTracing { + if let idx = result { + print("Result: \(input[.. where Input.Element: Hashable { + + var program: Program + + // TODO: Pre-allocated register banks + + var instructions: InstructionList { program.instructions } + + var enableTracing: Bool { + get { program.enableTracing } + set { program.enableTracing = newValue } + } + + public init( + _ program: Program, + enableTracing: Bool? 
= nil + ) { + var program = program + if let t = enableTracing { + program.enableTracing = t + } + self.program = program + } +} + +public struct AsyncEngine { /* ... */ } + +extension Engine: CustomStringConvertible { + public var description: String { + // TODO: better description + return program.description + } +} diff --git a/Sources/MatchingEngine/Instruction.swift b/Sources/MatchingEngine/Instruction.swift new file mode 100644 index 000000000..5c77122c6 --- /dev/null +++ b/Sources/MatchingEngine/Instruction.swift @@ -0,0 +1,387 @@ +import Util + +enum State { + /// Still running + case inprogress + + /// FAIL: halt and signal failure + case fail + + /// ACCEPT: halt and signal success + case accept +} +// TODO: better names for accept/fail/etc. Instruction +// conflates backtracking with signaling failure or success, +// could be clearer. + + +// TODO: Save point and call stack interactions should be more formalized. +// It's too easy to have unbalanced save/clears amongst function calls + +enum OpCode: UInt64 { + case invalid = 0 + + /// Do nothing + /// + /// Operand: optional string register containing a comment + case nop + + // MARK: - Control flow + + /// Branch to a new instruction + /// + /// Operand: instruction address to branch to + case branch + + /// Conditionally branch + /// + /// Operand: packed condition register and address to branch to + case condBranch + + // MARK: - Save points (e.g. for backtracking) + + /// Add a save point + /// + /// Operand: instruction address to resume from + /// + /// A save point is: + /// - a position in the input to restore + /// - a position in the call stack to cut off + /// - an instruction address to resume from + /// + /// TODO: Consider if separating would improve generality + case save + + /// + /// Add a save point that doesn't preserve input position + /// + /// NOTE: This is a prototype for now, but exposes + /// flaws in our formulation of back tracking. We could + /// instead have an instruction to update the top + /// most saved position instead + case saveAddress + + /// Remove the most recently saved point + /// + /// Precondition: There is a save point to remove + case clear + + /// View the most recently saved point + /// + /// UNIMPLEMENTED + case peek + + /// Composite peek-branch-clear else FAIL + case restore + + // MARK: - Function call stack + + /// Push an instruction address to the stack + /// + /// Operand: the instruction address + /// + /// UNIMPLEMENTED + case push + + /// Pop return address from call stack + /// + /// UNIMPLEMENTED + case pop + + /// Composite push-next-branch instruction + /// + /// Operand: the function's start address + case call + + /// Composite pop-branch instruction + /// + /// Operand: the instruction address + /// + /// NOTE: Currently, empty stack -> ACCEPT + case ret + + // MARK: - State transitions + + // TODO: State transitions need more work. 
We want + // granular core but also composite ones that will + // interact with save points + + /// Transition into ACCEPT and halt + case accept + + /// Signal failure (currently same as `restore`) + case fail + + /// Halt, fail, and signal failure + /// + /// Operand: optional string register specifying the reason + /// + /// TODO: Could have an Error existential area instead + case abort + + // MARK: - Interact with the input + + /// Advance our input position + /// + /// Operand: amount to advance by + case consume + + // TODO: assert, hooks, etc + + /// Composite assert-consume else restore + /// + /// Operand: Element register to compare against + case match + + /// Match against a provided element + /// + /// Operand: Packed condition register to write to and element register to compare against + case assertion + + // TODO: Fused assertions. It seems like we often want to + // branch based on assertion fail or success. + + + // MARK: - Debugging instructions + + /// Print a string to the output + /// + /// Operand: String register + case print + + /// Custom consumption operation + /// + /// Operand: consume hook register + static var consumeHook: OpCode { fatalError() } + + /// Custom assertion operation + /// + /// Operands: destination bool register, assert hook register + static var assertHook: OpCode { fatalError() } + + // ... + + +} +// TODO: Instructions for interacting with the various +// registers and stack + +// TODO: Nominal type for conditions, which can have an invert bit +// set + +// TODO: Better bit allocation for operand. Consider having an +// address and register number concept: addresses get ~48bits while +// registers get ~16 bits currently. + +// TODO: pack in a discriminator so we can assert on types + +// TODO: see if switching on top or bottom byte is better + +// TODO: store relative offsets for instructions, allows for +// smaller bit-width addresses and arbitrary length programs +// using jump islands + +// TODO: Explore if predication bit (or full register) would +// make it more feasible to SIMD some common programs. + +// +// +// Internal NOTE: Currently stored 1-biased so that we can +// provide assertions when this is accessed incorrectly. +// Likely to remove later. +// +// TODO: Consider hoisting the bias and un-bias up into Instruction +struct Operand: RawRepresentable { + // Store conditions in high bits, rest in low bits + var rawValue: UInt64 + + init(rawValue: UInt64) { + self.rawValue = rawValue + } + init<👻>( + condition: BoolRegister? = nil, + _ payload: TypedInt<👻>? = nil + ) { + self.rawValue = 0 + if let c = condition { + assert(c < 65_536) // How do I exponentiate in Swift?... + self.rawValue |= (c.bits&+1) &<< 48 + } + if let p = payload { initializePayload(p) } + } + init() { + // Workaround: have to invent phantom type + let payload: TypedInt<_PositionStackAddressRegister>? = nil + self.init(condition: nil, payload) + } + + var payloadMask: UInt64 { _payloadMask() } + + // Weird workaround: I want my masks to just be the literals, + // semantically similar to if I had pasted this value into the + // source code. Swift doesn't support generic vars, so we do + // this + // + // NOTE: Is Operand's un-descriminated union similar? 
+ func _payloadMask< + I: ExpressibleByIntegerLiteral + >() -> I { + 0x0000_FFFF_FFFF_FFFF + } + + var hasPayload: Bool { payloadBits > 0 } + var payloadBits: UInt64 { rawValue & payloadMask } + + func payload<👻>( + as ty: TypedInt<👻>.Type = TypedInt<👻>.self + ) -> TypedInt<👻> { + assert(hasPayload) + return TypedInt(payloadBits &- 1) + } + + mutating func initializePayload<👻>(_ value: TypedInt<👻>) { + assert(!hasPayload) + assert(value < _payloadMask()) + self.rawValue |= (value.bits&+1) + } + + var hasCondition: Bool { conditionBits > 0 } + var conditionBits: UInt64 { (rawValue & ~payloadMask) &>> 48 } + + var condition: BoolRegister { + assert(hasCondition) + return BoolRegister(conditionBits &- 1) + } + +} + +struct Instruction: RawRepresentable { + var rawValue: UInt64 + + var opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } + + var opcode: OpCode { + get { + OpCode( + rawValue: (rawValue & opcodeMask) &>> 56 + ).unsafelyUnwrapped + } + set { + assert(newValue != .invalid, "consider hoisting this") + assert(newValue.rawValue < 256) + self.rawValue &= ~opcodeMask + self.rawValue |= newValue.rawValue &<< 56 + } + } + var operand: Operand { + get { Operand(rawValue: rawValue & ~opcodeMask) } + set { + assert(newValue.rawValue & opcodeMask == 0) + self.rawValue &= opcodeMask + self.rawValue |= newValue.rawValue + } + } + + var destructure: (opcode: OpCode, operand: Operand) { + get { (opcode, operand) } + set { self = Self(opcode, operand) } + } + + init(rawValue: UInt64){ + self.rawValue = rawValue + } + init(_ opcode: OpCode, _ operand: Operand = Operand()) { + self.init(rawValue: 0) + self.opcode = opcode + self.operand = operand + } +} +extension Instruction { + static func nop(_ s: StringRegister? = nil) -> Instruction { + Instruction(.nop, Operand(s)) + } + static func branch(to: InstructionAddress? = nil) -> Instruction { + Instruction(.branch, Operand(to)) + } + static func condBranch(condition: BoolRegister? = nil, to: InstructionAddress? = nil) -> Instruction { + Instruction(.condBranch, Operand(condition: condition, to)) + } + static func save(resumingFrom: InstructionAddress? = nil) -> Instruction { + Instruction(.save, Operand(resumingFrom)) + } + static func saveAddress(resumingFrom: InstructionAddress? = nil) -> Instruction { + Instruction(.saveAddress, Operand(resumingFrom)) + } + static func clear() -> Instruction { + Instruction(.clear) + } + static func restore() -> Instruction { + Instruction(.restore) + } + static func fail() -> Instruction { + Instruction(.fail) + } + static func call(start: InstructionAddress? = nil) -> Instruction { + Instruction(.call, Operand(start)) + } + static func ret() -> Instruction { + Instruction(.ret, Operand()) + } + static func abort(_ s: StringRegister? = nil) -> Instruction { + Instruction(.abort, Operand(s)) + } + static func accept() -> Instruction { + Instruction(.accept) + } + static func consume(_ n: Distance? = nil) -> Instruction { + Instruction(.consume, Operand(n)) + } + static func match(_ e: ElementRegister? = nil) -> Instruction { + Instruction(.match, Operand(e)) + } + static func assertion( + condition: BoolRegister? = nil, _ e: ElementRegister? = nil + ) -> Instruction { + Instruction(.assertion, Operand(condition: condition, e)) + } + static func print(_ s: StringRegister? = nil) -> Instruction { + Instruction(.match, Operand(s)) + } +} + +extension Instruction { + var stringRegister: StringRegister? { + switch opcode { + case .nop: fallthrough + case .abort: fallthrough + case .print: + return operand.hasPayload ? 
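+      // `payload()` is generic over the register type; here the phantom type
+      // is inferred from the `StringRegister?` return type of this property.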
operand.payload() : nil + default: return nil + } + } + var instructionAddress: InstructionAddress? { + switch opcode { + case .branch: fallthrough + case .condBranch: fallthrough + case .save: fallthrough + case .saveAddress: fallthrough + case .call: + return operand.hasPayload ? operand.payload() : nil + default: return nil + } + } + var elementRegister: ElementRegister? { + switch opcode { + case .match: fallthrough + case .assertion: + return operand.hasPayload ? operand.payload() : nil + default: return nil + } + } + +} + +extension Instruction: InstructionProtocol { + var operandPC: InstructionAddress? { instructionAddress } +} + diff --git a/Sources/MatchingEngine/MatchingAPI.swift b/Sources/MatchingEngine/MatchingAPI.swift new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/Sources/MatchingEngine/MatchingAPI.swift @@ -0,0 +1 @@ + diff --git a/Sources/MatchingEngine/Processor.swift b/Sources/MatchingEngine/Processor.swift new file mode 100644 index 000000000..3fc99834d --- /dev/null +++ b/Sources/MatchingEngine/Processor.swift @@ -0,0 +1,215 @@ +import Util + +/// A concrete CU. Somehow will run the concrete logic and +/// feed stuff back to generic code +struct Controller { + var pc: InstructionAddress + + mutating func step() { + pc.rawValue += 1 + } +} + +struct Processor< + Input: Collection +> where Input.Element: Equatable { // maybe Hashable? + typealias Element = Input.Element + + let input: Input + var currentPosition: Position + + let instructions: InstructionList + var controller: Controller + + var cycleCount = 0 + + /// Our register file + var registers: Registers + + // Used for back tracking + var savePoints = Array<(SavePoint, stackEnd: Int)>() + + var callStack = Array() + + var state: State = .inprogress + + var enableTracing: Bool +} + + +extension Processor { + typealias Position = Input.Index + + // TODO: What all do we want to save? Configurable? + struct SavePoint { + var pc: InstructionAddress + var pos: Position? + + var destructure: (pc: InstructionAddress, pos: Position?) { + (pc, pos) + } + } +} + +extension Processor { + init( + _ program: Program, + _ input: Input, + enableTracing: Bool = false + ) { + self.controller = Controller(pc: 0) + self.instructions = program.instructions + self.input = input + self.enableTracing = enableTracing + self.currentPosition = input.startIndex + + self.registers = Registers(program, input.endIndex) + } +} + +extension Processor { + // Advance in our input + mutating func consume(_ n: Distance) { + // Want Collection to provide this behavior... + if input.distance(from: currentPosition, to: input.endIndex) < n.rawValue { + signalFailure() + return + } + currentPosition = input.index(currentPosition, offsetBy: n.rawValue) + } + + func doPrint(_ s: String) { + var enablePrinting: Bool { false } + if enablePrinting { + print(s) + } + } + + func load() -> Element? { + currentPosition < input.endIndex ? input[currentPosition] : nil + } + + mutating func signalFailure() { + guard let (thread, stackEnd) = savePoints.popLast() else { + state = .fail + return + } + assert(stackEnd <= callStack.count) + controller.pc = thread.pc + currentPosition = thread.pos ?? 
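+      // A save point created by `saveAddress` carries no input position
+      // (`pos == nil`), so restoring it keeps the current position and only
+      // rewinds the pc and cuts the call stack back to its saved depth.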
currentPosition + callStack.removeLast(callStack.count - stackEnd) + } + + mutating func cycle() { + assert(state == .inprogress) + if cycleCount == 0 { trace() } + defer { + cycleCount += 1 + trace() + } + let (opcode, operand) = fetch().destructure + switch opcode { + case .invalid: + fatalError("Invalid program") + case .nop: + if checkComments, operand.hasPayload { + doPrint(registers[operand.payload(as: StringRegister.self)]) + } + controller.step() + + case .branch: + controller.pc = operand.payload() + + case .condBranch: + if registers[operand.condition] { + controller.pc = operand.payload() + } else { + controller.step() + } + + case .save: + savePoints.append( + (SavePoint(pc: operand.payload(), pos: currentPosition), callStack.count)) + controller.step() + + case .saveAddress: + savePoints.append( + (SavePoint(pc: operand.payload(), pos: nil), callStack.count)) + controller.step() + + case .clear: + if let _ = savePoints.popLast() { + controller.step() + } else { + fatalError("TODO: What should we do here?") + } + + case .peek: + fatalError() + + case .restore: + signalFailure() + + case .push: + fatalError() + + case .pop: + fatalError() + + case .call: + controller.step() + callStack.append(controller.pc) + controller.pc = operand.payload() + + case .ret: + // TODO: Should empty stack mean success? + guard let r = callStack.popLast() else { + state = .accept + return + } + controller.pc = r + + case .abort: + // TODO: throw or otherwise propagate + doPrint(registers[operand.payload(as: StringRegister.self)]) + state = .fail + return + + case .accept: + state = .accept + return + + case .fail: + signalFailure() + + case .consume: + consume(operand.payload(as: Distance.self)) + controller.step() + + case .match: + let reg = operand.payload(as: ElementRegister.self) + guard let cur = load(), cur == registers[reg] else { + signalFailure() + return + } + consume(1) + controller.step() + + case .print: + // TODO: Debug stream + doPrint(registers[operand.payload(as: StringRegister.self)]) + + case .assertion: + let reg = operand.payload(as: ElementRegister.self) + var result: Bool + if let cur = load(), cur == registers[reg] { + result = true + } else { + result = false + } + registers[operand.condition] = result + controller.step() + } + } +} + diff --git a/Sources/MatchingEngine/Program.swift b/Sources/MatchingEngine/Program.swift new file mode 100644 index 000000000..a9afaf5d5 --- /dev/null +++ b/Sources/MatchingEngine/Program.swift @@ -0,0 +1,37 @@ +import Util + +public struct Program where Element: Equatable { + var instructions: InstructionList + + var staticElements: Array + var staticStrings: Array + + var registerInfo: RegisterInfo + + var enableTracing: Bool = false +} + +extension Program: CustomStringConvertible { + public var description: String { + var result = """ + Elements: \(staticElements) + Strings: \(staticStrings) + + """ + + // TODO: Extract into formatting code + + for idx in instructions.indices { + let inst = instructions[idx] + result += "[\(idx.rawValue)] \(inst)" + if let sp = inst.stringRegister { + result += " // \(staticStrings[sp.rawValue])" + } + if let ia = inst.instructionAddress { + result += " // \(instructions[ia])" + } + result += "\n" + } + return result + } +} diff --git a/Sources/MatchingEngine/Registers.swift b/Sources/MatchingEngine/Registers.swift new file mode 100644 index 000000000..da23caac6 --- /dev/null +++ b/Sources/MatchingEngine/Registers.swift @@ -0,0 +1,124 @@ +import Util + +extension Processor { + /// Our register file + 
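+  /// Each bank is indexed by its own strongly typed register number
+  /// (`StringRegister`, `BoolRegister`, `ElementRegister`, ...), so an index
+  /// into one bank cannot accidentally be used with another. Only the
+  /// element, bool, and string banks are exercised by the current opcodes;
+  /// the remaining banks are placeholders sized by `Program.RegisterInfo`.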
struct Registers { + // currently, these are static readonly + var elements: Array + + // currently, hold output of assertions + var bools: Array // TODO: bitset + + // currently, these are for comments and abort messages + var strings: Array + + // unused + var ints = Array() + + // unused + var floats = Array() + + // unused + // + // Unlikely to be static, as that means input must be bound + // at compile time + var positions = Array() + + // unused + var instructionAddresses = Array() + + // unused, any application? + var classStackAddresses = Array() + + // unused, any application? + var positionStackAddresses = Array() + + // unused, any application? + var savePointAddresses = Array() + + subscript(_ i: StringRegister) -> String { + strings[i.rawValue] + } + subscript(_ i: BoolRegister) -> Bool { + get { bools[i.rawValue] } + set { bools[i.rawValue] = newValue } + } + subscript(_ i: ElementRegister) -> Element { + elements[i.rawValue] + } + } +} + +extension Processor.Registers { + init( + _ program: Program, + _ sentinel: Input.Index + ) { + let info = program.registerInfo + + self.elements = program.staticElements + assert(elements.count == info.elements) + + self.strings = program.staticStrings + assert(strings.count == info.strings) + + self.bools = Array(repeating: false, count: info.bools) + + self.ints = Array(repeating: 0, count: info.ints) + + self.floats = Array(repeating: 0, count: info.floats) + + self.positions = Array(repeating: sentinel, count: info.positions) + + self.instructionAddresses = Array(repeating: 0, count: info.instructionAddresses) + + self.classStackAddresses = Array(repeating: 0, count: info.classStackAddresses) + + self.positionStackAddresses = Array(repeating: 0, count: info.positionStackAddresses) + + self.savePointAddresses = Array(repeating: 0, count: info.savePointAddresses) + } +} + +extension Program { + struct RegisterInfo { + var elements = 0 + var bools = 0 + var strings = 0 + var ints = 0 + var floats = 0 + var positions = 0 + var instructionAddresses = 0 + var classStackAddresses = 0 + var positionStackAddresses = 0 + var savePointAddresses = 0 + } +} + +extension Processor.Registers: CustomStringConvertible { + var description: String { + func formatRegisters( + _ name: String, _ regs: Array + ) -> String { + // TODO: multi-line if long + if regs.isEmpty { return "" } + + return "\(name): \(regs)\n" + } + + return """ + \(formatRegisters("elements", elements))\ + \(formatRegisters("bools", bools))\ + \(formatRegisters("strings", strings))\ + \(formatRegisters("ints", ints))\ + \(formatRegisters("floats", floats))\ + \(formatRegisters("positions", positions))\ + \(formatRegisters("instructionAddresses", instructionAddresses))\ + \(formatRegisters("classStackAddresses", classStackAddresses))\ + \(formatRegisters("positionStackAddresses", positionStackAddresses))\ + \(formatRegisters("savePointAddresses", savePointAddresses))\ + + """ + } +} + diff --git a/Sources/MatchingEngine/Tracing.swift b/Sources/MatchingEngine/Tracing.swift new file mode 100644 index 000000000..12a1481bf --- /dev/null +++ b/Sources/MatchingEngine/Tracing.swift @@ -0,0 +1,34 @@ +import Util + +extension Processor: TracedProcessor { + var isFailState: Bool { state == .fail } + var isAcceptState: Bool { state == .accept } + + var currentPC: InstructionAddress { controller.pc } +} + +extension Instruction: CustomStringConvertible { + var description: String { + "\(opcode) \(operand)" + } +} + +extension Operand: CustomStringConvertible { + var description: String { + var 
result = "" + if hasCondition { + result += "\(condition) " + } + if hasPayload { + let payload: TypedInt<_Boo> = payload() + result += payload.description + } + return result + } +} + +extension Processor.SavePoint: CustomStringConvertible { + var description: String { + String(describing: self.destructure) + } +} diff --git a/Sources/PEG/PEG.swift b/Sources/PEG/PEG.swift new file mode 100644 index 000000000..4715992ef --- /dev/null +++ b/Sources/PEG/PEG.swift @@ -0,0 +1,134 @@ +public enum PEG {} + +extension PEG { + enum Pattern { + /// Match any element + case any + + /// Match succeeds + case success + + /// Match fails + case failure + + /// Match a specific element + case element(Element) + + /// Match one of many potential elements + case charactetSet((Element) -> Bool) + + /// A literal sequence of elements + case literal(Array) + + /// Try `p1` first, and only if it doesn't work, backtrack and try `p2` + indirect case orderedChoice(Pattern, Pattern) + + /// Try each pattern in succession + indirect case concat([Pattern]) + + /// `p1 - p2` == `!p2 p1`, i.e. match `p1` so long as `p2` is not true + indirect case difference(Pattern, Pattern) + + /// Repeat a pattern at least `n` times + indirect case `repeat`(Pattern, atLeast: Int) + indirect case repeatRange(Pattern, atLeast: Int, atMost: Int) + + /// Match if `p1` matches, but does not consume input + indirect case and(Pattern) + + /// Match if `p1` does not match. Does not consume input + indirect case not(Pattern) + + /// Capture `p1` + indirect case capture(Pattern) + + /// Reference a declared variable (e.g. for recursive patterns) + case variable(String) + + + // Some conveniences + + /// The end of input + /// + /// .end == .not(.any) == assertion { $1 == $0.endIndex } + case end + + static func many(_ p: Pattern) -> Pattern { + .repeat(p, atLeast: 0) + } + static func oneOrMore(_ p: Pattern) -> Pattern { + .repeat(p, atLeast: 1) + } + static func range( + _ re: RE + ) -> Pattern where RE.Bound == Element { + .charactetSet({ re.contains($0) }) + } + } + + struct Production { + let name: String + let pattern: Pattern + + var destructure: (name: String, pattern: Pattern) { + (name, pattern) + } + } + + // Environment is, effectively, a list of productions + typealias Environment = Dictionary + + struct Program { + let start: String + let environment: Environment + + func checkInvariants() { + assert(environment[start] != nil) + } + + var entry: Production { + Production(name: start, pattern: environment[start]!) 
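+      // The force-unwrap is expected to be safe: checkInvariants() asserts
+      // that the start symbol has a production in the environment.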
+ } + + var destructure: (start: String, environment: Environment) { + (start, environment) + } + } +} + +extension PEG.Pattern: CustomStringConvertible { + var description: String { + switch self { + case .any: return "" + case .success: return "" + case .failure: return "" + case .element(let e): return "'\(e)'" + + case .charactetSet(let s): + return "\(String(describing: s))" + + case .literal(let l): + return "'\(l.map { "\($0)" }.joined())'" + + case .orderedChoice(let lhs, let rhs): + return "(\(lhs) | \(rhs))" + + case .concat(let s): + return s.map { "\($0)" }.joined(separator: " ") + + case .difference(let lhs, let rhs): return "(\(lhs) - \(rhs))" + + case .repeat(let p, let atLeast): + return "" + + case .repeatRange(let p, let atLeast, let atMost): + return "" + + case .and(let p): return "&(\(p))" + case .not(let p): return "!(\(p))" + case .capture(let p): return "" + case .variable(let v): return "\(v)" + case .end: return "" + } + } +} diff --git a/Sources/PEG/PEGCode.swift b/Sources/PEG/PEGCode.swift new file mode 100644 index 000000000..ce74ac269 --- /dev/null +++ b/Sources/PEG/PEGCode.swift @@ -0,0 +1,150 @@ +import Util + +extension PEG.VM { + struct Code { + var functions: Array + + var start: Function { functions[0] } + + // TODO: Capture metadata + + func lookup(_ f: FunctionId) -> Function { + functions[f.rawValue] + } + } +} + + +extension PEG.VM.Code { + enum Instruction { + case nop + + case comment(String) + + // Advance the InIndex by a fixed amount of positions + case consume(Int) + + // Advance the InIndex by a dynamic amount of positions + //case advance(Register) should we have non-bool registers? + + // TODO: Matching vs asserting... + + // Match and consume + case element(Element) + case set((Element) -> Bool) + case any + + // Control flow + case branch(to: LabelId) + case condBranch(condition: BoolRegister, to: LabelId) + case label(LabelId) // TODO: separate out + + // Function calls + case call(FunctionId) + case ret + + // Backtracking (TODO: should this be explicit slots or implicit stack?) + case save(restoringAt: LabelId) +// case restore + case commit(continuingAt: LabelId) + + // Capture + case startCapture + case endCapture + + // TODO: Consider captures an PC/SP pair, requires ability to + // save / retrieve SPs and a commit-capture instruction. + + // Terminate + case accept + case fail + case abort + + } + +} + +extension PEG.VM.Code { + struct Function { + let name: String + var instructions: InstructionList + + init(name: String) { + self.name = name + self.instructions = [.comment(name)] + } + + // Label location metadata + // TODO: Array permitting uninitialized values + var labels: Dictionary = [:] + + // TODO: Do we want to represent capture metadata? + + func lookup(_ p: InstructionAddress) -> Instruction { instructions[p] } + func lookup(_ l: LabelId) -> InstructionAddress { labels[l]! 
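+      // Labels are assumed to have been registered via addLabel before they
+      // are looked up; a missing label would be a compilation bug, hence the
+      // force-unwrap.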
}
+
+    mutating func add(_ inst: Instruction) {
+      if case .label = inst {
+        assertionFailure("Compilation error: label instruction")
+      }
+      instructions.append(inst)
+    }
+    mutating func addLabel(_ id: LabelId) {
+      labels[id] = InstructionAddress(instructions.count)
+      instructions.append(.label(id))
+    }
+  }
+}
+
+extension PEG.VM.Code.Instruction: CustomStringConvertible {
+  var description: String {
+    switch self {
+    case .nop: return ""
+    case .consume(let i): return ""
+    case .element(let e): return ""
+    case .set(let s): return ""
+    case .any: return ""
+    case .branch(let to): return ""
+    case .condBranch(let condition, let to):
+      return ""
+    case .label(let l): return "
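As a reading aid for the `PEG.Pattern` cases and the `PEG.VM.Code` instructions above, here is a minimal, standalone sketch of the same matching semantics (committed ordered choice, greedy repetition, negative lookahead) interpreted directly over a `Collection`. The names (`MiniPattern`, `match(_:from:)`) are invented for illustration and are not part of this patch, which lowers patterns to VM bytecode rather than walking an AST like this.

```swift
/// A deliberately tiny, direct interpreter for a PEG-like pattern language.
enum MiniPattern<Element: Equatable> {
  case any
  case element(Element)
  indirect case orderedChoice(MiniPattern, MiniPattern)
  indirect case concat([MiniPattern])
  indirect case repeatAtLeast(MiniPattern, Int)
  indirect case not(MiniPattern)
}

extension MiniPattern {
  /// Returns the position just past the match, or nil on failure.
  /// Backtracking is implicit: a failed alternative returns nil and the
  /// caller retries the next alternative from the same starting index.
  func match<C: Collection>(_ input: C, from start: C.Index) -> C.Index?
  where C.Element == Element {
    switch self {
    case .any:
      return start < input.endIndex ? input.index(after: start) : nil
    case .element(let e):
      return start < input.endIndex && input[start] == e
        ? input.index(after: start) : nil
    case .orderedChoice(let first, let second):
      // PEG choice is committed: try `first`; only if it fails, try `second`.
      return first.match(input, from: start) ?? second.match(input, from: start)
    case .concat(let parts):
      var pos = start
      for p in parts {
        guard let next = p.match(input, from: pos) else { return nil }
        pos = next
      }
      return pos
    case .repeatAtLeast(let p, let n):
      // Greedy: take as many repetitions as possible, then check the minimum.
      var pos = start, count = 0
      while let next = p.match(input, from: pos) {
        count += 1
        if next == pos { break }  // zero-width match: stop to avoid looping forever
        pos = next
      }
      return count >= n ? pos : nil
    case .not(let p):
      // Negative lookahead: succeed without consuming iff `p` fails here.
      return p.match(input, from: start) == nil ? start : nil
    }
  }
}

// "ab" followed by one or more "c".
let pattern: MiniPattern<Character> = .concat([
  .element("a"), .element("b"), .repeatAtLeast(.element("c"), 1)
])
let text = Array("abccc")
print(pattern.match(text, from: text.startIndex) == text.endIndex)  // true

// Ordered choice commits to the first alternative that succeeds:
// "ab" is tried before "a", so two elements are consumed.
let choice: MiniPattern<Character> = .orderedChoice(
  .concat([.element("a"), .element("b")]),
  .element("a")
)
print(choice.match(text, from: text.startIndex))  // Optional(2)
```

The `save(restoringAt:)`/`commit(continuingAt:)` pair in `PEG.VM.Code.Instruction` looks like the bytecode counterpart of this committed choice: `save` registers the fallback alternative as the backtrack target and `commit` discards that save point once the first alternative has succeeded.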