Skip to content

Commit 8d5f5a5

Browse files
author
Dave Abrahams
committed
[stdlib] StringProto: UnicodeScalars and ExtendedASCII views...
...for _FixedFormatUnicode. I just realized I have O(N^2) indexing in my type-erased wrappers, so that's next!
1 parent 83a4d0f commit 8d5f5a5

File tree

4 files changed

+219
-7
lines changed

4 files changed

+219
-7
lines changed

stdlib/public/core/AnyUnicode.swift

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
1010
//
1111
//===----------------------------------------------------------------------===//
12+
13+
//===--- TODO -------------------------------------------------------------===//
14+
//
15+
//===----------------------------------------------------------------------===//
1216
public protocol _AnyUnicode {
1317
var encoding: AnyUnicodeEncoding.Type { get }
1418

@@ -56,6 +60,19 @@ public protocol _FixedFormatUnicode : _AnyUnicode {
5660
// where Iterator.Element == Character
5761

5862
var characters: CharacterView { get }
63+
64+
/// A type that presents the string's Unicode scalar values.
65+
associatedtype UnicodeScalarView : BidirectionalCollection
66+
// where Iterator.Element == UnicodeScalar
67+
68+
var unicodeScalars: UnicodeScalarView { get }
69+
70+
/// A type presenting ASCII unicode scalar values verbatim, and otherwise
71+
/// presenting values >= 128, which are outside the range of ASCII.
72+
associatedtype ExtendedASCIIView : BidirectionalCollection = CodeUnits
73+
// where Iterator.Element : UnsignedInteger
74+
75+
var extendedASCII: ExtendedASCIIView { get }
5976
}
6077

6178
/// Default views
@@ -72,15 +89,62 @@ where
7289
}
7390
}
7491

92+
// UTF32 gets a default UnicodeScalarView that injects replacement characters
93+
// for illegal scalar values
//
// The view is a lazy map over `codeUnits`, so no scalars are computed until
// the returned collection is actually read.
94+
public extension _FixedFormatUnicode
95+
where
96+
Encoding == UTF32,
97+
CodeUnits.Iterator.Element == Encoding.EncodedScalar.Iterator.Element,
98+
CodeUnits.Iterator.Element : UnsignedInteger,
99+
CodeUnits.SubSequence : RandomAccessCollection,
100+
CodeUnits.SubSequence.Index == CodeUnits.Index,
101+
CodeUnits.SubSequence.SubSequence == CodeUnits.SubSequence,
102+
CodeUnits.SubSequence.Iterator.Element == CodeUnits.Iterator.Element {
103+
104+
/// The code units presented one-to-one as `UnicodeScalar`s.
var unicodeScalars: LazyMapCollection<CodeUnits, UnicodeScalar> {
105+
return codeUnits.lazy.map {
106+
// `UnicodeScalar($0)` produces nil when the code unit is not a legal
// scalar value; 0xFFFD (the replacement character) is substituted then.
UnicodeScalar($0)
107+
?? UnicodeScalar(_unchecked: 0xFFFD)
108+
}
109+
}
110+
}
111+
112+
// Everybody else gets a UnicodeScalarView based on transcoding to UTF32, which
113+
// already makes any necessary corrections.
114+
public extension _FixedFormatUnicode
115+
where
116+
CodeUnits.Iterator.Element == Encoding.EncodedScalar.Iterator.Element,
117+
CodeUnits.Iterator.Element : UnsignedInteger,
118+
CodeUnits.SubSequence : RandomAccessCollection,
119+
CodeUnits.SubSequence.Index == CodeUnits.Index,
120+
CodeUnits.SubSequence.SubSequence == CodeUnits.SubSequence,
121+
CodeUnits.SubSequence.Iterator.Element == CodeUnits.Iterator.Element {
122+
123+
var unicodeScalars: LazyMapBidirectionalCollection<
124+
UnicodeStorage<CodeUnits,Encoding>.ScalarsTranscoded<UTF32>
125+
, UnicodeScalar
126+
> {
127+
return UnicodeStorage(codeUnits, Encoding.self)
128+
.scalarsTranscoded(to: UTF32.self)
129+
.lazy.map { UnicodeScalar($0) }
130+
}
131+
}
132+
75133
public extension _FixedFormatUnicode {
76134
var encoding: AnyUnicodeEncoding.Type {
77135
return encoding as Encoding.Type
78136
}
79137
}
80138

139+
public extension _FixedFormatUnicode where ExtendedASCIIView == CodeUnits {
140+
var extendedASCII: CodeUnits {
141+
return codeUnits
142+
}
143+
}
144+
81145
/// Default implementations
82146
public extension _FixedFormatUnicode {
83-
147+
84148
var isKnownLatin1: Bool { return false }
85149
var isKnownASCII: Bool { return false }
86150
var isKnownValidEncoding: Bool { return false }
@@ -126,7 +190,14 @@ public extension _FixedFormatUnicode where Encoding == Latin1 {
126190
}
127191

128192
public extension _FixedFormatUnicode
129-
where Encoding == Latin1, CodeUnits.Iterator.Element : UnsignedInteger {
193+
where Encoding == Latin1,
194+
CodeUnits.Iterator.Element == Encoding.EncodedScalar.Iterator.Element,
195+
CodeUnits.Iterator.Element : UnsignedInteger,
196+
CodeUnits.SubSequence : RandomAccessCollection,
197+
CodeUnits.SubSequence.Index == CodeUnits.Index,
198+
CodeUnits.SubSequence.SubSequence == CodeUnits.SubSequence,
199+
CodeUnits.SubSequence.Iterator.Element == CodeUnits.Iterator.Element
200+
{
130201
var rawUTF16: LazyMapRandomAccessCollection<CodeUnits, UInt16> {
131202
return fccNormalizedUTF16
132203
}
@@ -135,8 +206,20 @@ where Encoding == Latin1, CodeUnits.Iterator.Element : UnsignedInteger {
135206
var fccNormalizedUTF16: LazyMapRandomAccessCollection<CodeUnits, UInt16> {
136207
return codeUnits.lazy.map { numericCast($0) }
137208
}
138-
}
209+
210+
var characters: LazyMapRandomAccessCollection<CodeUnits, Character> {
211+
return codeUnits.lazy.map {
212+
Character(UnicodeScalar(_unchecked: numericCast($0)))
213+
}
214+
}
139215

216+
var unicodeScalars: LazyMapRandomAccessCollection<CodeUnits, UnicodeScalar> {
217+
return codeUnits.lazy.map {
218+
UnicodeScalar(_unchecked: numericCast($0))
219+
}
220+
}
221+
}
222+
140223
//===--- Defaults for UTF16 and ValidUTF16 --------------------------------===//
141224
public extension _FixedFormatUnicode
142225
where Encoding.EncodedScalar == UTF16.EncodedScalar,

stdlib/public/core/StringStorage.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,13 @@ extension _UTF16StringStorage : _FixedFormatUnicode {
139139
CodeUnits, Encoding
140140
>.FCCNormalizedUTF16View
141141

142+
public typealias CharacterView = UnicodeStorage<CodeUnits,Encoding>.CharacterView
143+
144+
public typealias UnicodeScalarView = LazyMapBidirectionalCollection<
145+
UnicodeStorage<CodeUnits,Encoding>.ScalarsTranscoded<UTF32>
146+
, UnicodeScalar
147+
>
148+
142149
public var encoding: UTF16.Type { return UTF16.self }
143150
public var codeUnits: _UTF16StringStorage { return self }
144151

@@ -469,6 +476,13 @@ extension _UTF8StringStorage : _FixedFormatUnicode {
469476
public typealias FCCNormalizedUTF16View = UnicodeStorage<
470477
CodeUnits, Encoding
471478
>.FCCNormalizedUTF16View
479+
480+
public typealias CharacterView = UnicodeStorage<CodeUnits,Encoding>.CharacterView
481+
482+
public typealias UnicodeScalarView = LazyMapBidirectionalCollection<
483+
UnicodeStorage<CodeUnits,Encoding>.ScalarsTranscoded<UTF32>
484+
, UnicodeScalar
485+
>
472486

473487
public var codeUnits: CodeUnits { return self }
474488
public var rawUTF16: RawUTF16View { return RawUTF16View(self) }

stdlib/public/core/UnicodeStorage.swift

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,18 @@ extension UnicodeStorage.EncodedScalars : BidirectionalCollection {
178178
}
179179

180180
extension UnicodeStorage {
181+
/// A lazy bidirectional collection that maps each element of
/// `EncodedScalars` to a `ToEncoding.EncodedScalar`.
public typealias ScalarsTranscoded<ToEncoding : UnicodeEncoding>
182+
= LazyMapBidirectionalCollection<EncodedScalars, ToEncoding.EncodedScalar>
183+
184+
/// Returns a lazy view of this storage's scalars re-encoded in `ToEncoding`.
///
/// The view wraps `codeUnits` in `EncodedScalars` (interpreted with
/// `Encoding`) and lazily maps every element through `dst.encode`.
///
/// - Parameter dst: The encoding each scalar is transcoded to.
/// - Returns: A `ScalarsTranscoded<ToEncoding>` built over `codeUnits`.
public func scalarsTranscoded<ToEncoding : UnicodeEncoding>(
185+
to dst: ToEncoding.Type
186+
)
187+
-> ScalarsTranscoded<ToEncoding> {
188+
return UnicodeStorage.EncodedScalars(codeUnits, Encoding.self).lazy.map {
189+
// The force-unwrap traps if `dst.encode` returns nil for an element.
dst.encode($0)!
190+
}
191+
}
192+
181193
/// Given `CodeUnits` representing text that has been encoded with
182194
/// `FromEncoding`, provides a collection of `ToEncoding.CodeUnit`s
183195
/// representing the same text.
@@ -199,10 +211,7 @@ extension UnicodeStorage {
199211
from src: FromEncoding.Type = FromEncoding.self,
200212
to dst: ToEncoding.Type = ToEncoding.self
201213
) {
202-
base = Base(
203-
UnicodeStorage.EncodedScalars(codeUnits, src).lazy.map {
204-
dst.encode($0)!
205-
})
214+
base = Base(UnicodeStorage(codeUnits).scalarsTranscoded(to: dst))
206215
}
207216

208217
// FIXME: this should go in the extension below but for <rdar://30320012>

test/Prototypes/AnyUnicode.swift

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,91 @@ extension AnyUTF16.ZeroExtender : BidirectionalCollection, AnyUTF16_ {
209209
}
210210
}
211211

212+
/// The operations a collection of `UnicodeScalar`s must provide to serve as
/// the type-erased `base` stored inside `AnyUnicodeScalars`.
///
/// Positions are `UnicodeIndex` values and distances are `Int64`; traversal
/// is available in both directions via `index(after:)` and `index(before:)`.
protocol AnyUnicodeScalars_ {
213+
typealias IndexDistance = Int64
214+
typealias Index = UnicodeIndex
215+
typealias Element = UnicodeScalar
216+
var startIndex: Index { get }
217+
var endIndex: Index { get }
218+
func index(after: Index) -> Index
219+
func index(before: Index) -> Index
220+
subscript(i: Index) -> Element { get }
221+
222+
/// Calls `body` with a buffer of the elements and returns its result, or
/// returns `nil` without a result.
///
/// NOTE(review): presumably `nil` means no contiguous buffer of `Element`
/// already exists and `body` was not called — confirm against conformers.
func withExistingUnsafeBuffer<R>(
223+
_ body: (UnsafeBufferPointer<Element>) throws -> R
224+
) rethrows -> R?
225+
}
226+
227+
/// A type-erased bidirectional collection of `UnicodeScalar`s.
///
/// Every collection operation is forwarded to `base`, an existential of
/// `AnyUnicodeScalars_`. The generic initializer stores the supplied
/// collection wrapped in an `Adapter`.
struct AnyUnicodeScalars : BidirectionalCollection, AnyUnicodeScalars_ {
228+
// The wrapped implementation that all members below forward to.
let base: AnyUnicodeScalars_
229+
typealias IndexDistance = Int64
230+
typealias Index = UnicodeIndex
231+
typealias Element = AnyUnicodeScalars_.Element
232+
var startIndex: Index { return base.startIndex }
233+
var endIndex: Index { return base.endIndex }
234+
func index(after i: Index) -> Index { return base.index(after: i) }
235+
func index(before i: Index) -> Index { return base.index(before: i) }
236+
subscript(i: Index) -> Element { return base[i] }
237+
/// Forwards to `base.withExistingUnsafeBuffer`, returning whatever it
/// returns (including `nil`).
public func withExistingUnsafeBuffer<R>(
238+
_ body: (UnsafeBufferPointer<Element>) throws -> R
239+
) rethrows -> R? {
240+
return try base.withExistingUnsafeBuffer(body)
241+
}
242+
243+
/// Creates an instance that erases the type of `c`.
///
/// - Parameter c: Any bidirectional collection whose elements are
///   `UnicodeScalar`s; it is stored inside an `Adapter`.
init<C: BidirectionalCollection>(_ c: C)
244+
where C.Iterator.Element == UnicodeScalar {
245+
base = Adapter(base: c)
246+
}
247+
248+
/// Storage for a concrete collection of `UnicodeScalar`s; its
/// `AnyUnicodeScalars_` conformance is supplied in an extension.
struct Adapter<
249+
Base: BidirectionalCollection
250+
> where Base.Iterator.Element == UnicodeScalar {
251+
let base: Base
252+
}
253+
}
254+
255+
/// Adapts any bidirectional collection of unicode scalar values to
256+
/// AnyUnicodeScalars_
///
/// Indices are `UnicodeIndex` values carrying an offset into `base`:
/// `startIndex` has offset 0 and `endIndex` has offset `base.count`.
///
/// NOTE(review): every index operation below turns the offset into a
/// `Base.Index` with `base.index(atOffset:)` and the result back into an
/// offset with `base.offset(of:)`. For a base that is only bidirectional
/// those conversions are presumably linear in the offset, which would make a
/// full traversal quadratic — confirm.
257+
extension AnyUnicodeScalars.Adapter
258+
: BidirectionalCollection, AnyUnicodeScalars_
259+
{
260+
typealias IndexDistance = Int64
261+
typealias Index = UnicodeIndex
262+
typealias Element = AnyUnicodeScalars_.Element
263+
264+
var startIndex: Index { return Index(offset: 0) }
265+
var endIndex: Index { return Index(offset: numericCast(base.count)) }
266+
267+
/// Returns the index whose offset is that of the base position
/// immediately after the base position at `i.offset`.
func index(after i: Index) -> Index {
268+
return Index(offset: numericCast(
269+
base.offset(of: base.index(after: base.index(atOffset: i.offset)))))
270+
}
271+
272+
/// Returns the index whose offset is that of the base position
/// immediately before the base position at `i.offset`.
func index(before i: Index) -> Index {
273+
return Index(offset: numericCast(
274+
base.offset(of: base.index(before: base.index(atOffset: i.offset)))))
275+
}
276+
277+
/// Returns the index reached by moving `n` base positions from the base
/// position at `i.offset`, expressed again as an offset.
func index(_ i: Index, offsetBy n: Int64) -> Index {
278+
return Index(offset: numericCast(
279+
base.offset(
280+
of: base.index(base.index(atOffset: i.offset),
281+
offsetBy: numericCast(n)))))
282+
}
283+
284+
/// Accesses the element of `base` at the position `i.offset`.
subscript(i: Index) -> Element {
285+
return base[base.index(atOffset: i.offset)]
286+
}
287+
288+
/// Calls `body` with the base's existing buffer when that buffer is an
/// `UnsafeBufferPointer<Element>`; otherwise returns `nil`.
public func withExistingUnsafeBuffer<R>(
289+
_ body: (UnsafeBufferPointer<Element>) throws -> R
290+
) rethrows -> R? {
291+
return try base.withExistingUnsafeBuffer {
292+
// The cast through `Any` yields nil when the buffer's type does not
// match, in which case `body` is not invoked for it.
try ($0 as Any as? UnsafeBufferPointer<Element>).map(body)
293+
// Presumably the call above produces a nested optional; `flatMap`
// collapses it to a single `R?`.
}.flatMap { $0 }
294+
}
295+
}
296+
212297
protocol AnyRandomAccessUTF16_ : AnyUTF16_ {
213298
typealias IndexDistance = Int64
214299
typealias Index = UnicodeIndex
@@ -622,6 +707,27 @@ extension UTF16CompatibleStringContents : _FixedFormatUnicode {
622707
typealias CodeUnits = AnyRandomAccessUTF16
623708
typealias FCCNormalizedUTF16View = AnyUTF16
624709

710+
var characters: AnyCharacters {
711+
switch self {
712+
case .utf16(let storage):
713+
return storage.characters
714+
case .latin1(let storage):
715+
return storage.characters
716+
}
717+
}
718+
719+
/// The contents presented as a type-erased collection of Unicode scalar
/// values.
///
/// Each storage's own `unicodeScalars` view is wrapped in
/// `AnyUnicodeScalars` so that both cases share a single return type. This
/// must not return the `characters` view: its elements are `Character`s,
/// not `UnicodeScalar`s.
var unicodeScalars: AnyUnicodeScalars {
720+
switch self {
721+
case .utf16(let storage):
722+
return AnyUnicodeScalars(storage.unicodeScalars)
723+
case .latin1(let storage):
724+
return AnyUnicodeScalars(storage.unicodeScalars)
725+
}
726+
}
727+
728+
//typealias CharacterView = LazyMapRandomAccessCollection<AnyRandomAccessUTF16, Character>
729+
// typealias UnicodeScalarView = LazyMapRandomAccessCollection<AnyRandomAccessUTF16, UnicodeScalar>
730+
625731
var rawUTF16: AnyUTF16 {
626732
switch self {
627733
case .utf16(let storage):

0 commit comments

Comments
 (0)