Skip to content

Enable string conversion in EUC-JP. #1296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Sources/FoundationEssentials/String/String+IO.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ dynamic public func _cfMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>,
// Provide swift-corelibs-foundation with an entry point to convert some bytes into a String
return nil
}

/// Hook for decoding `bytes` in the given `encoding` through ICU.
///
/// This default body produces no string at all; it always yields `nil`.
/// - Parameters:
///   - bytes: The raw bytes to decode.
///   - encoding: The encoding the bytes are expressed in.
/// - Returns: `nil` from this placeholder body.
dynamic package func _icuMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
    // Placeholder: the working implementation lives in FoundationInternationalization.
    nil
}
#endif

@available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
Expand Down Expand Up @@ -184,6 +189,17 @@ extension String {
}
}
self = bytes.withContiguousStorageIfAvailable(buildString) ?? Array(bytes).withUnsafeBufferPointer(buildString)
case .japaneseEUC:
// Here we catch encodings that are supported by Foundation Framework
// but are not supported by corelibs-foundation.
// We delegate conversion to ICU.
guard let string = (
bytes.withContiguousStorageIfAvailable({ _icuMakeStringFromBytes($0, encoding: encoding) }) ??
Array(bytes).withUnsafeBufferPointer({ _icuMakeStringFromBytes($0, encoding: encoding) })
) else {
return nil
}
self = string
#endif
default:
#if FOUNDATION_FRAMEWORK
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ dynamic public func _cfStringEncodingConvert(string: String, using encoding: UIn
// Dynamically replaced by swift-corelibs-foundation to implement encodings that we do not have Swift replacements for, yet
return nil
}

/// Hook for encoding `string` into bytes of the given `encoding` through ICU.
///
/// This default body performs no conversion; it always yields `nil`.
/// - Parameters:
///   - string: The string to encode.
///   - encoding: The target encoding.
///   - allowLossyConversion: Whether unrepresentable characters may be substituted.
/// - Returns: `nil` from this placeholder body.
dynamic package func _icuStringEncodingConvert(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
    // Placeholder: the working implementation lives in FoundationInternationalization.
    nil
}
#endif

@available(FoundationPreview 0.4, *)
Expand Down Expand Up @@ -249,6 +254,11 @@ extension String {
buffer.appendElement(value)
}
}
case .japaneseEUC:
// Here we catch encodings that are supported by Foundation Framework
// but are not supported by corelibs-foundation.
// We delegate conversion to ICU.
return _icuStringEncodingConvert(string: self, using: encoding, allowLossyConversion: allowLossyConversion)
#endif
default:
#if FOUNDATION_FRAMEWORK
Expand Down
204 changes: 204 additions & 0 deletions Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if canImport(FoundationEssentials)
import FoundationEssentials
#endif
internal import _FoundationICU

private extension String.Encoding {
    /// The converter name handed to ICU for this encoding, or `nil` when the
    /// encoding is not listed here.
    ///
    /// UTF-* and US-ASCII are intentionally absent because they are expected
    /// to be converted upstream.
    var _icuConverterName: String? {
        // TODO: Replace this with forthcoming(?) public property such as https://github.com/swiftlang/swift-foundation/pull/1243
        switch self {
        // Japanese encodings
        case .japaneseEUC:
            return "EUC-JP"
        case .shiftJIS:
            return "Shift_JIS"
        case .iso2022JP:
            return "ISO-2022-JP"

        // ISO Latin encodings
        case .isoLatin1:
            return "ISO-8859-1"
        case .isoLatin2:
            return "ISO-8859-2"

        // Windows code pages
        case .windowsCP1250:
            return "windows-1250"
        case .windowsCP1251:
            return "windows-1251"
        case .windowsCP1252:
            return "windows-1252"
        case .windowsCP1253:
            return "windows-1253"
        case .windowsCP1254:
            return "windows-1254"

        // Classic Mac OS
        case .macOSRoman:
            return "macintosh"

        default:
            return nil
        }
    }
}

extension ICU {
final class StringConverter: @unchecked Sendable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: does this need the @unchecked? I would have assumed that since it's a final class with all immutable, Sendable properties that the compiler can validate this conformance

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you.
That's a vestige of my first implementation where _converter was a bare pointer...

Copy link
Member Author

@YOCKOW YOCKOW May 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry.
@unchecked is necessary still because LockedState<OpaquePointer> is not Sendable...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, LockedState is Sendable, but it requires its stored value to be of a Sendable type. I had mistakenly thought OpaquePointer was Sendable, but it is indeed not, which is why the Sendable conformance isn't available there. This @unchecked Sendable makes sense then, since the pointer and anything it stores never escape the lock.

private let _converter: LockedState<OpaquePointer> // UConverter*

let encoding: String.Encoding

/// Opens an ICU converter for `encoding`.
///
/// Initialization fails when `encoding` has no `_icuConverterName`, or when
/// `ucnv_open` returns no converter or reports a non-success status.
init?(encoding: String.Encoding) {
    guard let icuName = encoding._icuConverterName else {
        return nil
    }

    var openStatus: UErrorCode = U_ZERO_ERROR
    let opened = ucnv_open(icuName, &openStatus)
    guard let opened, openStatus.isSuccess else {
        return nil
    }

    self.encoding = encoding
    self._converter = LockedState(initialState: opened)
}

/// Closes the wrapped ICU converter while holding the lock.
deinit {
    _converter.withLock { handle in
        ucnv_close(handle)
    }
}
}
}

extension ICU.StringConverter {
    /// Decodes `data`, expressed in this converter's encoding, into a `String`.
    ///
    /// The converter is locked for the duration of the call and its
    /// to-Unicode state is reset on exit.
    ///
    /// - Parameter data: The encoded bytes.
    /// - Returns: The decoded string, or `nil` when the input is longer than
    ///   ICU's 32-bit lengths can express or the conversion does not succeed.
    func decode(data: Data) -> String? {
        return _converter.withLock { (converter) -> String? in
            defer {
                ucnv_resetToUnicode(converter)
            }

            // ICU takes 32-bit lengths; report failure instead of trapping
            // on input that does not fit.
            guard let srcLength = CInt(exactly: data.count) else {
                return nil
            }
            // Computed in 64 bits and clamped so the initial size estimate
            // itself cannot overflow.
            let initCapacity = CInt(
                clamping: Int64(srcLength) * Int64(ucnv_getMinCharSize(converter)) + 1
            )
            // NOTE(review): `_withResizingUCharBuffer` is defined elsewhere;
            // presumably it retries with a larger buffer on overflow — confirm.
            return _withResizingUCharBuffer(initialSize: initCapacity) { (dest, capacity, status) in
                return data.withUnsafeBytes { src in
                    ucnv_toUChars(
                        converter,
                        dest,
                        capacity,
                        src.baseAddress,
                        srcLength,
                        &status
                    )
                }
            }
        }
    }

    /// Encodes `string` into bytes of this converter's encoding.
    ///
    /// The converter is locked for the duration of the call and its
    /// from-Unicode state is reset on exit.
    ///
    /// - Parameters:
    ///   - string: The string to encode.
    ///   - lossy: When `true`, unrepresentable characters are replaced with a
    ///     substitution character; when `false`, conversion stops at the first
    ///     such character and `nil` is returned.
    /// - Returns: The encoded bytes, or `nil` when the input is too long for
    ///   ICU's 32-bit lengths or any ICU call reports a failure.
    func encode(string: String, allowLossyConversion lossy: Bool) -> Data? {
        return _converter.withLock { (converter) -> Data? in
            defer {
                ucnv_resetFromUnicode(converter)
            }

            let utf16Rep = string.utf16
            let uchars = UnsafeMutableBufferPointer<UChar>.allocate(capacity: utf16Rep.count)
            _ = uchars.initialize(fromContentsOf: utf16Rep)
            defer {
                uchars.deallocate()
            }

            // ICU takes 32-bit lengths; report failure instead of trapping.
            guard let srcLength = CInt(exactly: uchars.count) else {
                return nil
            }
            // Same formula as ICU's UCNV_GET_MAX_BYTES_FOR_STRING macro
            // (function-like macros are not imported into Swift): the extra
            // 10 units leave room for converter overhead.
            let wantedCapacity = (Int64(srcLength) + 10) * Int64(ucnv_getMaxCharSize(converter))
            guard let capacity = CInt(exactly: wantedCapacity) else {
                return nil
            }

            let dest = UnsafeMutableRawPointer.allocate(
                byteCount: Int(capacity),
                alignment: MemoryLayout<CChar>.alignment
            )
            // `dest` is owned here until it is handed to `Data` below; every
            // early exit in between must free it.
            var ownsDest = true
            defer {
                if ownsDest {
                    dest.deallocate()
                }
            }

            var status: UErrorCode = U_ZERO_ERROR
            if lossy {
                // Substitution character: 0xFF for ASCII, "?" (0x3F) otherwise.
                var lossyChar: UChar = encoding == .ascii ? 0xFF : 0x3F
                ucnv_setSubstString(
                    converter,
                    &lossyChar,
                    1,
                    &status
                )
                guard status.isSuccess else { return nil }

                ucnv_setFromUCallBack(
                    converter,
                    UCNV_FROM_U_CALLBACK_SUBSTITUTE,
                    nil, // newContext
                    nil, // oldAction
                    nil, // oldContext
                    &status
                )
                guard status.isSuccess else { return nil }
            } else {
                ucnv_setFromUCallBack(
                    converter,
                    UCNV_FROM_U_CALLBACK_STOP,
                    nil, // newContext
                    nil, // oldAction
                    nil, // oldContext
                    &status
                )
                guard status.isSuccess else { return nil }
            }

            let actualLength = ucnv_fromUChars(
                converter,
                dest,
                capacity,
                uchars.baseAddress,
                srcLength,
                &status
            )
            guard status.isSuccess else { return nil }

            // Ownership of `dest` moves to the returned `Data`, whose custom
            // deallocator frees it.
            ownsDest = false
            return Data(
                bytesNoCopy: dest,
                count: Int(actualLength),
                deallocator: .custom({ pointer, _ in pointer.deallocate() })
            )
        }
    }
}

extension ICU.StringConverter {
    /// Converters created so far, keyed by the encoding they serve.
    private static let _converters: LockedState<[String.Encoding: ICU.StringConverter]> = .init(initialState: [:])

    /// Returns the shared converter for `encoding`, creating and caching it on
    /// first use.
    ///
    /// - Returns: The converter, or `nil` when one cannot be created for
    ///   `encoding`; failures are not cached.
    static func converter(for encoding: String.Encoding) -> ICU.StringConverter? {
        return _converters.withLock { (cache) -> ICU.StringConverter? in
            if let cached = cache[encoding] {
                return cached
            }
            guard let created = ICU.StringConverter(encoding: encoding) else {
                return nil
            }
            cache[encoding] = created
            return created
        }
    }
}


/// ICU-backed replacement for `_icuMakeStringFromBytes(_:encoding:)`.
///
/// - Parameters:
///   - bytes: The encoded bytes; only borrowed for the duration of the call.
///   - encoding: The encoding `bytes` is expressed in.
/// - Returns: The decoded string; the empty string for empty input in a
///   supported encoding; `nil` when no converter is available for `encoding`
///   or decoding fails.
@_dynamicReplacement(for: _icuMakeStringFromBytes(_:encoding:))
func _icuMakeStringFromBytes_impl(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
    guard let converter = ICU.StringConverter.converter(for: encoding) else {
        return nil
    }

    // An empty buffer may have a nil `baseAddress`. Zero bytes decode to the
    // empty string, which is a valid result and not a conversion failure.
    guard let pointer = bytes.baseAddress, bytes.count > 0 else {
        return ""
    }

    // Since we want to avoid an unnecessary copy here,
    // `bytes` is converted to `UnsafeMutableRawPointer`
    // because `Data(bytesNoCopy:count:deallocator:)` accepts only that type.
    // This operation is still safe,
    // as the pointer is just borrowed (not escaped, not mutated)
    // in `ICU.StringConverter.decode(data:) -> String?`.
    // In addition to that, `Data` is useful here
    // because it is `Sendable` (and has CoW behavior).
    let data = Data(
        bytesNoCopy: UnsafeMutableRawPointer(mutating: pointer),
        count: bytes.count,
        deallocator: .none
    )
    return converter.decode(data: data)
}

/// ICU-backed replacement for
/// `_icuStringEncodingConvert(string:using:allowLossyConversion:)`.
///
/// - Returns: The encoded bytes, or `nil` when no converter is available for
///   `encoding` or the converter's `encode` call returns `nil`.
@_dynamicReplacement(for: _icuStringEncodingConvert(string:using:allowLossyConversion:))
func _icuStringEncodingConvert_impl(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
    if let icuConverter = ICU.StringConverter.converter(for: encoding) {
        return icuConverter.encode(string: string, allowLossyConversion: allowLossyConversion)
    }
    return nil
}
126 changes: 126 additions & 0 deletions Tests/FoundationInternationalizationTests/StringTests+Data.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if FOUNDATION_FRAMEWORK
@testable import Foundation
#else
@testable import FoundationEssentials
@testable import FoundationInternationalization
#endif // FOUNDATION_FRAMEWORK

#if canImport(TestSupport)
import TestSupport
#endif

final class StringConverterTests: XCTestCase {
    /// Asserts that `string` encodes to exactly `data` in `encoding`, and that
    /// `data` decodes back to exactly `string`.
    ///
    /// `file` and `line` default to the call site so failures point at the
    /// individual case rather than at this helper.
    private func _test_roundTripConversion(
        string: String,
        data: Data,
        encoding: String._Encoding,
        file: StaticString = #filePath,
        line: UInt = #line
    ) {
        let encoded = string.data(using: encoding)
        XCTAssertEqual(encoded, data, "Failed to convert string to data.", file: file, line: line)

        let decoded = String(data: data, encoding: encoding)
        XCTAssertEqual(string, decoded, "Failed to convert data to string.", file: file, line: line)
    }

    /// Regression coverage for EUC-JP conversion; see
    /// https://github.com/swiftlang/swift-foundation/issues/1016.
    func test_japaneseEUC() {
        // Pure ASCII passes through unchanged.
        _test_roundTripConversion(
            string: "ABC",
            data: Data([0x41, 0x42, 0x43]),
            encoding: .japaneseEUC
        )

        // Plane 1, row 1.
        _test_roundTripConversion(
            string: "、。◇",
            data: Data([0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xFE]),
            encoding: .japaneseEUC
        )

        // Plane 1, row 4: hiragana.
        _test_roundTripConversion(
            string: "ひらがな",
            data: Data([0xA4, 0xD2, 0xA4, 0xE9, 0xA4, 0xAC, 0xA4, 0xCA]),
            encoding: .japaneseEUC
        )

        // Plane 1, row 5: katakana.
        _test_roundTripConversion(
            string: "ヴヵヶ",
            data: Data([0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6]),
            encoding: .japaneseEUC
        )

        // Plane 1, row 6: Greek letters.
        _test_roundTripConversion(
            string: "Σπ",
            data: Data([0xA6, 0xB2, 0xA6, 0xD0]),
            encoding: .japaneseEUC
        )

        // Basic kanji.
        _test_roundTripConversion(
            string: "日本",
            data: Data([0xC6, 0xFC, 0xCB, 0xDC]),
            encoding: .japaneseEUC
        )

        // Characters affected by the JIS83/JIS90 amendments.
        _test_roundTripConversion(
            string: "扉⇔穴",
            data: Data([0xC8, 0xE2, 0xA2, 0xCE, 0xB7, 0xEA]),
            encoding: .japaneseEUC
        )

        // A character EUC-JP cannot represent: strict conversion fails, and
        // lossy conversion substitutes "?".
        let sushi = "Sushi🍣"
        let eucJP = String._Encoding.japaneseEUC

        let strictResult = sushi.data(using: eucJP)
        XCTAssertNil(strictResult)

        let lossyResult = sushi.data(using: eucJP, allowLossyConversion: true)
        let expectedLossy = "Sushi?".data(using: .utf8)
        XCTAssertEqual(lossyResult, expectedLossy)
    }
}