diff --git a/Sources/FoundationEssentials/String/String+IO.swift b/Sources/FoundationEssentials/String/String+IO.swift
index 55f8e19ae..640efe422 100644
--- a/Sources/FoundationEssentials/String/String+IO.swift
+++ b/Sources/FoundationEssentials/String/String+IO.swift
@@ -24,6 +24,11 @@ dynamic public func _cfMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>,
     // Provide swift-corelibs-foundation with an entry point to convert some bytes into a String
     return nil
 }
+
+dynamic package func _icuMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
+    // Concrete implementation is provided by FoundationInternationalization.
+    return nil
+}
 #endif
 
 @available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
@@ -184,6 +189,17 @@ extension String {
                     }
                 }
                 self = bytes.withContiguousStorageIfAvailable(buildString) ?? Array(bytes).withUnsafeBufferPointer(buildString)
+            case .japaneseEUC:
+                // Here we catch encodings that are supported by Foundation Framework
+                // but are not supported by corelibs-foundation.
+                // We delegate conversion to ICU.
+                guard let string = (
+                    bytes.withContiguousStorageIfAvailable({ _icuMakeStringFromBytes($0, encoding: encoding) }) ??
+                    Array(bytes).withUnsafeBufferPointer({ _icuMakeStringFromBytes($0, encoding: encoding) })
+                ) else {
+                    return nil
+                }
+                self = string
 #endif
             default:
 #if FOUNDATION_FRAMEWORK
diff --git a/Sources/FoundationEssentials/String/StringProtocol+Essentials.swift b/Sources/FoundationEssentials/String/StringProtocol+Essentials.swift
index 4c99c1599..1447385e0 100644
--- a/Sources/FoundationEssentials/String/StringProtocol+Essentials.swift
+++ b/Sources/FoundationEssentials/String/StringProtocol+Essentials.swift
@@ -91,6 +91,11 @@ dynamic public func _cfStringEncodingConvert(string: String, using encoding: UIn
     // Dynamically replaced by swift-corelibs-foundation to implement encodings that we do not have Swift replacements for, yet
     return nil
 }
+
+dynamic package func _icuStringEncodingConvert(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
+    // Concrete implementation is provided by FoundationInternationalization.
+    return nil
+}
 #endif
 
 @available(FoundationPreview 0.4, *)
@@ -249,6 +254,11 @@ extension String {
                     buffer.appendElement(value)
                 }
             }
+        case .japaneseEUC:
+            // Here we catch encodings that are supported by Foundation Framework
+            // but are not supported by corelibs-foundation.
+            // We delegate conversion to ICU.
+            return _icuStringEncodingConvert(string: self, using: encoding, allowLossyConversion: allowLossyConversion)
 #endif
         default:
 #if FOUNDATION_FRAMEWORK
diff --git a/Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift b/Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift
new file mode 100644
index 000000000..0ad69aecc
--- /dev/null
+++ b/Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift
@@ -0,0 +1,237 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2025 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+#if canImport(FoundationEssentials)
+import FoundationEssentials
+#endif
+internal import _FoundationICU
+
+private extension String.Encoding {
+    /// The ICU converter name for this encoding, or `nil` if none is mapped here.
+    var _icuConverterName: String? {
+        // TODO: Replace this with forthcoming(?) public property such as https://github.com/swiftlang/swift-foundation/pull/1243
+        // Note: UTF-* and US-ASCII are omitted here because they are supposed to be converted upstream.
+        switch self {
+        case .japaneseEUC: "EUC-JP"
+        case .isoLatin1: "ISO-8859-1"
+        case .shiftJIS: "Shift_JIS"
+        case .isoLatin2: "ISO-8859-2"
+        case .windowsCP1251: "windows-1251"
+        case .windowsCP1252: "windows-1252"
+        case .windowsCP1253: "windows-1253"
+        case .windowsCP1254: "windows-1254"
+        case .windowsCP1250: "windows-1250"
+        case .iso2022JP: "ISO-2022-JP"
+        case .macOSRoman: "macintosh"
+        default: nil
+        }
+    }
+}
+
+extension ICU {
+    /// Owns an ICU converter opened for a single `String.Encoding`.
+    /// All access to the underlying `UConverter*` goes through `LockedState`.
+    final class StringConverter: @unchecked Sendable {
+        private let _converter: LockedState<OpaquePointer> // UConverter*
+
+        let encoding: String.Encoding
+
+        /// Opens the ICU converter mapped to `encoding`.
+        /// Fails when the encoding has no converter name or `ucnv_open` does not succeed.
+        init?(encoding: String.Encoding) {
+            guard let convName = encoding._icuConverterName else {
+                return nil
+            }
+            var status: UErrorCode = U_ZERO_ERROR
+            guard let converter = ucnv_open(convName, &status), status.isSuccess else {
+                return nil
+            }
+            self._converter = LockedState(initialState: converter)
+            self.encoding = encoding
+        }
+
+        deinit {
+            _converter.withLock { ucnv_close($0) }
+        }
+    }
+}
+
+extension ICU.StringConverter {
+    /// Decodes `data` from this converter's encoding into a `String`.
+    ///
+    /// - Returns: The result of `_withResizingUCharBuffer`, or `nil` when the input
+    ///   length or the initial buffer capacity does not fit ICU's 32-bit sizes.
+    func decode(data: Data) -> String? {
+        return _converter.withLock { (converter) -> String? in
+            defer {
+                ucnv_resetToUnicode(converter)
+            }
+
+            // ICU takes 32-bit lengths; fail instead of trapping on oversized input.
+            guard let srcLength = CInt(exactly: data.count),
+                  let initCapacity = CInt(exactly: Int64(srcLength) * Int64(ucnv_getMinCharSize(converter)) + 1) else {
+                return nil
+            }
+            return _withResizingUCharBuffer(initialSize: initCapacity) { (dest, capacity, status) in
+                return data.withUnsafeBytes { src in
+                    ucnv_toUChars(
+                        converter,
+                        dest,
+                        capacity,
+                        src.baseAddress,
+                        srcLength,
+                        &status
+                    )
+                }
+            }
+        }
+    }
+
+    /// Encodes `string` into this converter's encoding.
+    ///
+    /// - Parameters:
+    ///   - string: The string to convert.
+    ///   - lossy: When `true`, the substitute callback is installed with a one-unit
+    ///     substitution string; when `false`, the stop callback is installed.
+    /// - Returns: The encoded bytes, or `nil` when any ICU call reports a failure status.
+    func encode(string: String, allowLossyConversion lossy: Bool) -> Data? {
+        return _converter.withLock { (converter) -> Data? in
+            defer {
+                ucnv_resetFromUnicode(converter)
+            }
+
+            let utf16Rep = string.utf16
+            let uchars = UnsafeMutableBufferPointer<UChar>.allocate(capacity: utf16Rep.count)
+            _ = uchars.initialize(fromContentsOf: utf16Rep)
+            defer {
+                uchars.deallocate()
+            }
+
+            // ICU takes 32-bit lengths; fail instead of trapping on oversized input.
+            guard let srcLength = CInt(exactly: uchars.count),
+                  let capacity = CInt(exactly: Int64(srcLength) * Int64(ucnv_getMaxCharSize(converter)) + 1) else {
+                return nil
+            }
+
+            var status: UErrorCode = U_ZERO_ERROR
+            if lossy {
+                var lossyChar: UChar = encoding == .ascii ? 0xFF : 0x3F
+                ucnv_setSubstString(
+                    converter,
+                    &lossyChar,
+                    1,
+                    &status
+                )
+                guard status.isSuccess else { return nil }
+
+                ucnv_setFromUCallBack(
+                    converter,
+                    UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+                    nil, // newContext
+                    nil, // oldAction
+                    nil, // oldContext
+                    &status
+                )
+                guard status.isSuccess else { return nil }
+            } else {
+                ucnv_setFromUCallBack(
+                    converter,
+                    UCNV_FROM_U_CALLBACK_STOP,
+                    nil, // newContext
+                    nil, // oldAction
+                    nil, // oldContext
+                    &status
+                )
+                guard status.isSuccess else { return nil }
+            }
+
+            // Allocate the output only once the converter is configured, so that the
+            // early exits above cannot leak it.
+            let dest = UnsafeMutableRawPointer.allocate(
+                byteCount: Int(capacity),
+                alignment: MemoryLayout<UInt8>.alignment
+            )
+            let actualLength = ucnv_fromUChars(
+                converter,
+                dest,
+                capacity,
+                uchars.baseAddress,
+                srcLength,
+                &status
+            )
+            guard status.isSuccess else {
+                // `Data` never took ownership of `dest`; release it here.
+                dest.deallocate()
+                return nil
+            }
+            return Data(
+                bytesNoCopy: dest,
+                count: Int(actualLength),
+                deallocator: .custom({ pointer, _ in pointer.deallocate() })
+            )
+        }
+    }
+}
+
+extension ICU.StringConverter {
+    private static let _converters: LockedState<[String.Encoding: ICU.StringConverter]> = .init(initialState: [:])
+
+    /// Returns the cached converter for `encoding`, creating and caching one on first use.
+    /// Returns `nil` when a converter cannot be created for `encoding`.
+    static func converter(for encoding: String.Encoding) -> ICU.StringConverter? {
+        return _converters.withLock {
+            if let converter = $0[encoding] {
+                return converter
+            }
+            if let converter = ICU.StringConverter(encoding: encoding) {
+                $0[encoding] = converter
+                return converter
+            }
+            return nil
+        }
+    }
+}
+
+
+@_dynamicReplacement(for: _icuMakeStringFromBytes(_:encoding:))
+func _icuMakeStringFromBytes_impl(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
+    guard let converter = ICU.StringConverter.converter(for: encoding) else {
+        return nil
+    }
+    // An empty buffer may have no base address, yet it still decodes to an empty string.
+    guard let pointer = bytes.baseAddress, !bytes.isEmpty else {
+        return ""
+    }
+
+    // Since we want to avoid unnecessary copy here,
+    // `bytes` is converted to `UnsafeMutableRawPointer`
+    // because `Data(bytesNoCopy:count:deallocator:)` accepts only that type.
+    // This operation is still safe,
+    // as the pointer is just borrowed (not escaped, not mutated)
+    // in `ICU.StringConverter.decode(data:) -> String?`.
+    // In addition to that, `Data` is useful here
+    // because it is `Sendable` (and has CoW behavior).
+    let data = Data(
+        bytesNoCopy: UnsafeMutableRawPointer(mutating: pointer),
+        count: bytes.count,
+        deallocator: .none
+    )
+    return converter.decode(data: data)
+}
+
+@_dynamicReplacement(for: _icuStringEncodingConvert(string:using:allowLossyConversion:))
+func _icuStringEncodingConvert_impl(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
+    guard let converter = ICU.StringConverter.converter(for: encoding) else {
+        return nil
+    }
+    return converter.encode(string: string, allowLossyConversion: allowLossyConversion)
+}
diff --git a/Tests/FoundationInternationalizationTests/StringTests+Data.swift b/Tests/FoundationInternationalizationTests/StringTests+Data.swift
new file mode 100644
index 000000000..7c0babc26
--- /dev/null
+++ b/Tests/FoundationInternationalizationTests/StringTests+Data.swift
@@ -0,0 +1,134 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2025 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+#if FOUNDATION_FRAMEWORK
+@testable import Foundation
+#else
+@testable import FoundationEssentials
+@testable import FoundationInternationalization
+#endif // FOUNDATION_FRAMEWORK
+
+#if canImport(TestSupport)
+import TestSupport
+#endif
+
+final class StringConverterTests: XCTestCase {
+    /// Asserts that `string` encodes to `data` and that `data` decodes back to `string` using `encoding`.
+    private func _test_roundTripConversion(
+        string: String,
+        data: Data,
+        encoding: String._Encoding,
+        file: StaticString = #filePath,
+        line: UInt = #line
+    ) {
+        XCTAssertEqual(
+            string.data(using: encoding), data, "Failed to convert string to data.",
+            file: file, line: line
+        )
+        XCTAssertEqual(
+            string, String(data: data, encoding: encoding), "Failed to convert data to string.",
+            file: file, line: line
+        )
+    }
+
+    func test_japaneseEUC() {
+        // Confirm that https://github.com/swiftlang/swift-foundation/issues/1016 is fixed.
+
+        // Empty input
+        _test_roundTripConversion(
+            string: "",
+            data: Data(),
+            encoding: .japaneseEUC
+        )
+
+        // ASCII
+        _test_roundTripConversion(
+            string: "ABC",
+            data: Data([0x41, 0x42, 0x43]),
+            encoding: .japaneseEUC
+        )
+
+        // Plane 1 Row 1
+        _test_roundTripConversion(
+            string: "、。◇",
+            data: Data([
+                0xA1, 0xA2,
+                0xA1, 0xA3,
+                0xA1, 0xFE,
+            ]),
+            encoding: .japaneseEUC
+        )
+
+        // Plane 1 Row 4 (Hiragana)
+        _test_roundTripConversion(
+            string: "ひらがな",
+            data: Data([
+                0xA4, 0xD2,
+                0xA4, 0xE9,
+                0xA4, 0xAC,
+                0xA4, 0xCA,
+            ]),
+            encoding: .japaneseEUC
+        )
+
+        // Plane 1 Row 5 (Katakana)
+        _test_roundTripConversion(
+            string: "ヴヵヶ",
+            data: Data([
+                0xA5, 0xF4,
+                0xA5, 0xF5,
+                0xA5, 0xF6,
+            ]),
+            encoding: .japaneseEUC
+        )
+
+        // Plane 1 Row 6 (Greek Alphabets)
+        _test_roundTripConversion(
+            string: "Σπ",
+            data: Data([
+                0xA6, 0xB2,
+                0xA6, 0xD0,
+            ]),
+            encoding: .japaneseEUC
+        )
+
+        // Basic Kanji
+        _test_roundTripConversion(
+            string: "日本",
+            data: Data([
+                0xC6, 0xFC,
+                0xCB, 0xDC,
+            ]),
+            encoding: .japaneseEUC
+        )
+
+        // Amendment by JIS83/JIS90
+        _test_roundTripConversion(
+            string: "扉⇔穴",
+            data: Data([
+                0xC8, 0xE2,
+                0xA2, 0xCE,
+                0xB7, 0xEA,
+            ]),
+            encoding: .japaneseEUC
+        )
+
+        // Unsupported characters
+        let sushi = "Sushi🍣"
+        XCTAssertNil(sushi.data(using: String._Encoding.japaneseEUC))
+        XCTAssertEqual(
+            sushi.data(using: String._Encoding.japaneseEUC, allowLossyConversion: true),
+            "Sushi?".data(using: .utf8)
+        )
+    }
+}
+