Skip to content

Commit 83a4d0f

Browse files
author
Dave Abrahams
committed
[stdlib] Create a UTF8 StringStorage
Add a CharacterView to _FixedFormatUnicode. Implement RangeReplaceable inits to override defaults that would always assert for non-empty inputs (you can't grow a _BoundedBuffer beyond its capacity and the default capacity is zero).
1 parent 71f39fc commit 83a4d0f

File tree

4 files changed

+218
-7
lines changed

4 files changed

+218
-7
lines changed

stdlib/public/core/AnyUnicode.swift

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,26 @@ public protocol _FixedFormatUnicode : _AnyUnicode {
5050
// where Iterator.Element == UInt16
5151

5252
/// An FCC-normalized view of the string
53-
var fccNormalizedUTF16 : FCCNormalizedUTF16View { get }
53+
var fccNormalizedUTF16: FCCNormalizedUTF16View { get }
54+
55+
associatedtype CharacterView : BidirectionalCollection
56+
// where Iterator.Element == Character
57+
58+
var characters: CharacterView { get }
59+
}
60+
61+
/// Default views
62+
public extension _FixedFormatUnicode
63+
where
64+
CodeUnits.Iterator.Element == Encoding.EncodedScalar.Iterator.Element,
65+
CodeUnits.Iterator.Element : UnsignedInteger,
66+
CodeUnits.SubSequence : RandomAccessCollection,
67+
CodeUnits.SubSequence.Index == CodeUnits.Index,
68+
CodeUnits.SubSequence.SubSequence == CodeUnits.SubSequence,
69+
CodeUnits.SubSequence.Iterator.Element == CodeUnits.Iterator.Element {
70+
var characters: UnicodeStorage<CodeUnits,Encoding>.CharacterView {
71+
return UnicodeStorage(codeUnits).characters
72+
}
5473
}
5574

5675
public extension _FixedFormatUnicode {
@@ -108,12 +127,12 @@ public extension _FixedFormatUnicode where Encoding == Latin1 {
108127

109128
public extension _FixedFormatUnicode
110129
where Encoding == Latin1, CodeUnits.Iterator.Element : UnsignedInteger {
111-
var rawUTF16 : LazyMapRandomAccessCollection<CodeUnits, UInt16> {
130+
var rawUTF16: LazyMapRandomAccessCollection<CodeUnits, UInt16> {
112131
return fccNormalizedUTF16
113132
}
114133

115134
/// An FCC-normalized view of the string
116-
var fccNormalizedUTF16 : LazyMapRandomAccessCollection<CodeUnits, UInt16> {
135+
var fccNormalizedUTF16: LazyMapRandomAccessCollection<CodeUnits, UInt16> {
117136
return codeUnits.lazy.map { numericCast($0) }
118137
}
119138
}

stdlib/public/core/BoundedBufferReference.swift

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,19 @@ extension _BoundedBufferReference {
117117

118118
/// Fulfills the RangeReplaceableCollection requirements
119119
extension _BoundedBufferReference {
120+
public init<S : Sequence>(_ elements: S)
121+
where S.Iterator.Element == Iterator.Element {
122+
self.init(Array(elements))
123+
}
124+
125+
public init<C : Collection>(_ elements: C)
126+
where C.Iterator.Element == Iterator.Element {
127+
self.init(_uninitializedCount: numericCast(elements.count))
128+
withUnsafeMutableBufferPointer {
129+
elements._copyCompleteContents(initializing: $0)
130+
}
131+
}
132+
120133
public func replaceSubrange<C>(
121134
_ target: Range<Int>,
122135
with newValues: C

stdlib/public/core/StringStorage.swift

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,3 +379,101 @@ extension _Latin1StringStorage : _FixedFormatUnicode {
379379
}
380380
}
381381

382+
//===--- UTF-8 String Storage ---------------------------------------------===//
383+
public struct UTF8StringHeader : _BoundedBufferHeader {
384+
public var count: UInt32
385+
public var capacity: UInt32
386+
387+
public init(count: Int, capacity: Int) {
388+
self.count = numericCast(count)
389+
self.capacity = numericCast(capacity)
390+
}
391+
}
392+
393+
public final class _UTF8StringStorage
394+
// FIXME: we might want our own header type
395+
: _StringStorageBase<UTF8StringHeader, UInt8>
396+
, _NSStringCore // Ensures that we implement essential NSString methods.
397+
{
398+
// WORKAROUND: helping type inference along will be unnecessary someday
399+
public typealias _Element = UInt8
400+
public typealias Iterator = IndexingIterator<_UTF8StringStorage>
401+
402+
/// Returns a pointer to contiguously-stored UTF-16 code units
403+
/// comprising the whole string, or NULL if such storage isn't
404+
/// available.
405+
///
406+
/// WARNING: don't use this method from Swift code; ARC may end the
407+
/// lifetime of self before you get a chance to use the result.
408+
@objc
409+
public func _fastCharacterContents() -> UnsafeMutablePointer<UInt16>? {
410+
return nil
411+
}
412+
413+
/// Returns a pointer to contiguously-stored code units in the
414+
/// system encoding comprising the whole string, or NULL if such
415+
/// storage isn't available.
416+
///
417+
// WARNING: don't use this method from Swift code; ARC may end the lifetime of
418+
// self before you get a chance to use the result.
419+
// WARNING: Before you implement this as anything other than “return nil,”
420+
// see https://github.com/apple/swift/pull/3151#issuecomment-285583557
421+
@objc
422+
public func _fastCStringContents(
423+
_ nullTerminationRequired: Int8
424+
) -> UnsafePointer<CChar>? {
425+
return nil
426+
}
427+
428+
// WORKAROUND: rdar://31047127 prevents us from hoisting this into
429+
// _StringStorageBase
430+
@nonobjc
431+
public override var _baseAddress: UnsafeMutablePointer<UTF8.CodeUnit> {
432+
return UnsafeMutablePointer(
433+
Builtin.projectTailElems(self, Element.self))
434+
}
435+
436+
@nonobjc
437+
public var isKnownASCII = false
438+
@nonobjc
439+
public var isKnownLatin1 = false
440+
@nonobjc
441+
public var isKnownValidEncoding = false
442+
@nonobjc
443+
public var isKnownFCCNormalized = false
444+
@nonobjc
445+
public var isKnownFCDForm = false
446+
@nonobjc
447+
public var isKnownNFDNormalized = false
448+
@nonobjc
449+
public var isKnownNFCNormalized = false
450+
}
451+
452+
extension _UTF8StringStorage : _BoundedBufferReference {
453+
@nonobjc
454+
public static func _emptyInstance() -> _UTF8StringStorage {
455+
return _UTF8StringStorage(uninitializedWithMinimumCapacity: 0)
456+
}
457+
}
458+
459+
extension _UTF8StringStorage : _FixedFormatUnicode {
460+
public typealias Encoding = UTF8
461+
462+
// WORKAROUND: helping type inference along will be unnecessary someday
463+
public typealias CodeUnits = _UTF8StringStorage
464+
465+
public typealias RawUTF16View = UnicodeStorage<
466+
CodeUnits, Encoding
467+
>.TranscodedView<UTF16>
468+
469+
public typealias FCCNormalizedUTF16View = UnicodeStorage<
470+
CodeUnits, Encoding
471+
>.FCCNormalizedUTF16View
472+
473+
public var codeUnits: CodeUnits { return self }
474+
public var rawUTF16: RawUTF16View { return RawUTF16View(self) }
475+
public var fccNormalizedUTF16: FCCNormalizedUTF16View {
476+
return UnicodeStorage(self, UTF8.self).fccNormalizedUTF16
477+
}
478+
}
479+

test/Prototypes/AnyUnicode.swift

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,55 @@ extension AnyUnicode {
517517
}
518518
}
519519

520+
// Work around name collision ambiguity
521+
extension _FixedFormatUnicode {
522+
internal var _codeUnits: CodeUnits { return codeUnits }
523+
internal var _characters: CharacterView { return characters }
524+
internal var _fccNormalizedUTF16: FCCNormalizedUTF16View {
525+
return fccNormalizedUTF16
526+
}
527+
}
528+
529+
extension AnyUnicode
530+
where Self : _FixedFormatUnicode,
531+
Self.CodeUnits : RandomAccessCollection,
532+
Self.CodeUnits.Iterator.Element : UnsignedInteger,
533+
Self.RawUTF16View : BidirectionalCollection,
534+
Self.RawUTF16View.Iterator.Element == UTF16.CodeUnit,
535+
Self.FCCNormalizedUTF16View.Iterator.Element : UnsignedInteger,
536+
Self.CodeUnits.Index == Self.CodeUnits.SubSequence.Index,
537+
Self.CodeUnits.SubSequence : RandomAccessCollection,
538+
Self.CodeUnits.SubSequence == Self.CodeUnits.SubSequence.SubSequence,
539+
Self.CodeUnits.Iterator.Element == Self.CodeUnits.SubSequence.Iterator.Element,
540+
Self.CodeUnits.SubSequence.Iterator.Element == Self.Encoding.EncodedScalar.Iterator.Element,
541+
Self.CharacterView.Iterator.Element == Character,
542+
Self.CharacterView.Index : SignedInteger
543+
{
544+
var codeUnits: AnyCodeUnits {
545+
return AnyCodeUnits(self._codeUnits)
546+
}
547+
548+
var rawUTF16: AnyUTF16 { return AnyUTF16(self.rawUTF16 as RawUTF16View) }
549+
// FIXME: this could be more efficient for encodings such as Latin1
550+
var utf32: AnyUnicodeBidirectionalUInt32 {
551+
return AnyUnicodeBidirectionalUInt32(
552+
UnicodeStorage(
553+
_codeUnits, Encoding.self
554+
).transcoded(to: UTF32.self)
555+
)
556+
}
557+
var fccNormalizedUTF16: AnyUTF16 {
558+
return AnyUTF16(fccNormalizedUTF16 as FCCNormalizedUTF16View)
559+
}
560+
// FIXME: Could be more efficient generally
561+
var extendedASCII: AnyUnicodeBidirectionalUInt32 {
562+
return utf32
563+
}
564+
var characters: AnyCharacters {
565+
return AnyCharacters(_characters)
566+
}
567+
}
568+
520569
struct AnyRandomAccessUnsignedIntegers<
521570
Base: RandomAccessCollection, Element_ : UnsignedInteger
522571
> : RandomAccessCollection
@@ -585,9 +634,9 @@ extension UTF16CompatibleStringContents : _FixedFormatUnicode {
585634
var fccNormalizedUTF16: FCCNormalizedUTF16View {
586635
switch self {
587636
case .utf16(let storage):
588-
return AnyUTF16(storage.fccNormalizedUTF16)
637+
return storage.fccNormalizedUTF16
589638
case .latin1(let storage):
590-
return AnyUTF16(storage.fccNormalizedUTF16)
639+
return storage.fccNormalizedUTF16
591640
}
592641
}
593642

@@ -728,6 +777,14 @@ case latin1(_Latin1StringStorage)
728777
case any(AnyUnicodeBox)
729778
}
730779

780+
extension _UTF16StringStorage : AnyUnicode {
781+
782+
}
783+
784+
extension _Latin1StringStorage : AnyUnicode {
785+
786+
}
787+
731788
extension AnyStringContents : AnyUnicode {
732789
var encoding: AnyUnicodeEncoding.Type {
733790
switch self {
@@ -788,9 +845,9 @@ extension AnyStringContents : AnyUnicode {
788845
var fccNormalizedUTF16: AnyUTF16 {
789846
switch self {
790847
case .utf16(let storage):
791-
return AnyUTF16(storage.fccNormalizedUTF16)
848+
return storage.fccNormalizedUTF16
792849
case .latin1(let storage):
793-
return AnyUTF16(storage.fccNormalizedUTF16)
850+
return storage.fccNormalizedUTF16
794851
case .any(let base):
795852
return base.fccNormalizedUTF16
796853
}
@@ -883,6 +940,18 @@ extension AnyStringContents : AnyUnicode {
883940
return base.isKnownNFCNormalized
884941
}
885942
}
943+
944+
init<T: AnyUnicode>(_ x: T) {
945+
if let s = x as? _Latin1StringStorage {
946+
self = .latin1(s)
947+
}
948+
else if let s = x as? _UTF16StringStorage {
949+
self = .utf16(s)
950+
}
951+
else {
952+
self = .any(AnyUnicodeBox(wrapping: x))
953+
}
954+
}
886955
}
887956

888957
print(MemoryLayout<UTF16CompatibleStringContents>.size)
@@ -895,4 +964,16 @@ suite.test("basics") {
895964
expectTrue(x.elementsEqual(y))
896965
}
897966

967+
suite.test("AnyStringContents") {
968+
let sample = "abcdefghijklmnopqrstuvwxyz\n"
969+
+ "🇸🇸🇬🇱🇱🇸🇩🇯🇺🇸\n"
970+
+ "Σὲ 👥🥓γνωρίζω ἀπὸ τὴν κόψη χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!\n"
971+
+ "Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,\n"
972+
+ "გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო\n"
973+
+ "Зарегистрируйтесь сейчас на Десятую Международную Конференцию по\n"
974+
+ " ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่\n"
975+
+ "ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ"
976+
977+
var s = AnyStringContents(_UTF16StringStorage(sample.utf16))
978+
}
898979
runAllTests()

0 commit comments

Comments
 (0)