Skip to content

Enable string conversion in EUC-JP. #1296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Sources/FoundationEssentials/String/String+IO.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ dynamic public func _cfMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>,
// Provide swift-corelibs-foundation with an entry point to convert some bytes into a String
return nil
}

/// Entry point for decoding `bytes` in `encoding` through ICU.
///
/// This default body is only a placeholder and always yields `nil`;
/// the concrete implementation is provided by FoundationInternationalization.
dynamic package func _icuMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
    nil
}
#endif

@available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
Expand Down Expand Up @@ -184,6 +189,17 @@ extension String {
}
}
self = bytes.withContiguousStorageIfAvailable(buildString) ?? Array(bytes).withUnsafeBufferPointer(buildString)
case .japaneseEUC:
// Here we catch encodings that are supported by Foundation Framework
// but are not supported by corelibs-foundation.
// We delegate conversion to ICU.
guard let string = (
bytes.withContiguousStorageIfAvailable({ _icuMakeStringFromBytes($0, encoding: encoding) }) ??
Array(bytes).withUnsafeBufferPointer({ _icuMakeStringFromBytes($0, encoding: encoding) })
) else {
return nil
}
self = string
#endif
default:
#if FOUNDATION_FRAMEWORK
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ dynamic public func _cfStringEncodingConvert(string: String, using encoding: UIn
// Dynamically replaced by swift-corelibs-foundation to implement encodings that we do not have Swift replacements for, yet
return nil
}

/// Entry point for encoding `string` into `encoding` through ICU.
///
/// This default body is only a placeholder and always yields `nil`;
/// the concrete implementation is provided by FoundationInternationalization.
dynamic package func _icuStringEncodingConvert(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
    nil
}
#endif

@available(FoundationPreview 0.4, *)
Expand Down Expand Up @@ -249,6 +254,11 @@ extension String {
buffer.appendElement(value)
}
}
case .japaneseEUC:
// Here we catch encodings that are supported by Foundation Framework
// but are not supported by corelibs-foundation.
// We delegate conversion to ICU.
return _icuStringEncodingConvert(string: self, using: encoding, allowLossyConversion: allowLossyConversion)
#endif
default:
#if FOUNDATION_FRAMEWORK
Expand Down
206 changes: 206 additions & 0 deletions Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if canImport(FoundationEssentials)
import FoundationEssentials
#endif
internal import _FoundationICU

private extension String.Encoding {
    /// The converter name handed to ICU for this encoding,
    /// or `nil` when the encoding has no entry in the table below.
    var _icuConverterName: String? {
        // TODO: Replace this with forthcoming(?) public property such as https://github.com/swiftlang/swift-foundation/pull/1243
        // Note: UTF-* and US-ASCII are omitted here because they are supposed to be converted upstream.
        switch self {
        // Japanese
        case .japaneseEUC:
            return "EUC-JP"
        case .shiftJIS:
            return "Shift_JIS"
        case .iso2022JP:
            return "ISO-2022-JP"
        // ISO 8859
        case .isoLatin1:
            return "ISO-8859-1"
        case .isoLatin2:
            return "ISO-8859-2"
        // Windows code pages
        case .windowsCP1250:
            return "windows-1250"
        case .windowsCP1251:
            return "windows-1251"
        case .windowsCP1252:
            return "windows-1252"
        case .windowsCP1253:
            return "windows-1253"
        case .windowsCP1254:
            return "windows-1254"
        // Classic Mac OS
        case .macOSRoman:
            return "macintosh"
        default:
            return nil
        }
    }
}

extension ICU {
    /// Owns one ICU converter handle for a single `String.Encoding`.
    ///
    /// The handle is kept inside a `LockedState`, and every use in this type
    /// goes through `withLock`.
    final class StringConverter: @unchecked Sendable {
        /// The encoding this converter was opened for.
        let encoding: String.Encoding

        /// The underlying `UConverter*`.
        private let _converter: LockedState<OpaquePointer>

        /// Opens an ICU converter for `encoding`.
        ///
        /// Fails when `encoding` has no ICU converter name,
        /// or when `ucnv_open` does not hand back a usable converter.
        init?(encoding: String.Encoding) {
            guard let icuName = encoding._icuConverterName else {
                return nil
            }

            var openStatus: UErrorCode = U_ZERO_ERROR
            guard let handle = ucnv_open(icuName, &openStatus) else {
                return nil
            }
            guard openStatus.isSuccess else {
                return nil
            }

            self.encoding = encoding
            self._converter = LockedState(initialState: handle)
        }

        deinit {
            // Hand the converter back to ICU.
            _converter.withLock { handle in
                ucnv_close(handle)
            }
        }
    }
}

extension ICU.StringConverter {
    /// Decodes `data`, interpreted in this converter's encoding, into a `String`.
    ///
    /// - Parameter data: The encoded bytes to convert.
    /// - Returns: The decoded string, or `nil` when the conversion does not
    ///   produce one or when the input is too large to be described
    ///   with the 32-bit lengths that ICU accepts.
    func decode(data: Data) -> String? {
        return _converter.withLock { (converter) -> String? in
            defer {
                // The converter is reused, so reset it however we leave this closure.
                ucnv_resetToUnicode(converter)
            }

            // ICU takes 32-bit lengths. Reject oversized input instead of trapping
            // on the integer conversion.
            guard let srcLength = CInt(exactly: data.count) else {
                return nil
            }
            let minCharSize = Int64(ucnv_getMinCharSize(converter))
            guard let initCapacity = CInt(exactly: Int64(srcLength) * minCharSize + 1) else {
                return nil
            }

            return _withResizingUCharBuffer(initialSize: initCapacity) { (dest, capacity, status) in
                return data.withUnsafeBytes { src in
                    ucnv_toUChars(
                        converter,
                        dest,
                        capacity,
                        src.baseAddress,
                        srcLength,
                        &status
                    )
                }
            }
        }
    }

    /// Encodes `string` into this converter's encoding.
    ///
    /// - Parameters:
    ///   - string: The string to convert.
    ///   - lossy: When `true`, the converter is configured with a substitution
    ///     character and `UCNV_FROM_U_CALLBACK_SUBSTITUTE`;
    ///     when `false`, it is configured with `UCNV_FROM_U_CALLBACK_STOP`.
    /// - Returns: The encoded bytes, or `nil` when configuring the converter or
    ///   the conversion itself reports a failure, or when the lengths involved
    ///   do not fit in the 32-bit values that ICU accepts.
    func encode(string: String, allowLossyConversion lossy: Bool) -> Data? {
        return _converter.withLock { (converter) -> Data? in
            defer {
                // The converter is reused, so reset it however we leave this closure.
                ucnv_resetFromUnicode(converter)
            }

            // Copy the UTF-16 code units into a contiguous buffer that ICU can read.
            let utf16Rep = string.utf16
            let uchars = UnsafeMutableBufferPointer<UChar>.allocate(capacity: utf16Rep.count)
            _ = uchars.initialize(fromContentsOf: utf16Rep)
            defer {
                uchars.deallocate()
            }

            // ICU takes 32-bit lengths. Reject oversized input instead of trapping
            // on the integer conversion.
            guard let srcLength = CInt(exactly: uchars.count) else {
                return nil
            }
            let maxCharSize = Int64(ucnv_getMaxCharSize(converter))
            guard let destCapacity = CInt(exactly: Int64(srcLength) * maxCharSize + 1) else {
                return nil
            }

            let dest = UnsafeMutableRawPointer.allocate(
                byteCount: Int(destCapacity),
                alignment: MemoryLayout<CChar>.alignment
            )
            // `dest` belongs to us until it is handed to `Data` at the very end.
            // Every early `return nil` below must release it.
            var destHandedOff = false
            defer {
                if !destHandedOff {
                    dest.deallocate()
                }
            }

            var status: UErrorCode = U_ZERO_ERROR
            if lossy {
                // 0x3F is "?". The 0xFF alternative applies only to `.ascii`.
                var lossyChar: UChar = encoding == .ascii ? 0xFF : 0x3F
                ucnv_setSubstString(
                    converter,
                    &lossyChar,
                    1,
                    &status
                )
                guard status.isSuccess else { return nil }

                ucnv_setFromUCallBack(
                    converter,
                    UCNV_FROM_U_CALLBACK_SUBSTITUTE,
                    nil, // newContext
                    nil, // oldAction
                    nil, // oldContext
                    &status
                )
                guard status.isSuccess else { return nil }
            } else {
                ucnv_setFromUCallBack(
                    converter,
                    UCNV_FROM_U_CALLBACK_STOP,
                    nil, // newContext
                    nil, // oldAction
                    nil, // oldContext
                    &status
                )
                guard status.isSuccess else { return nil }
            }

            let actualLength = ucnv_fromUChars(
                converter,
                dest,
                destCapacity,
                uchars.baseAddress,
                srcLength,
                &status
            )
            guard status.isSuccess else { return nil }

            // From here on the returned `Data` owns `dest` and frees it via its deallocator.
            destHandedOff = true
            return Data(
                bytesNoCopy: dest,
                count: Int(actualLength),
                deallocator: .custom({ pointer, _ in pointer.deallocate() })
            )
        }
    }
}

extension ICU.StringConverter {
    /// Converters created so far, keyed by encoding.
    private static let _converters: LockedState<[String.Encoding: ICU.StringConverter]> = .init(initialState: [:])

    /// Returns the cached converter for `encoding`, creating and caching one on first use.
    ///
    /// - Returns: `nil` when no converter can be created for `encoding`;
    ///   nothing is cached in that case.
    static func converter(for encoding: String.Encoding) -> ICU.StringConverter? {
        return _converters.withLock { (cache) -> ICU.StringConverter? in
            if let cached = cache[encoding] {
                return cached
            }
            guard let created = ICU.StringConverter(encoding: encoding) else {
                return nil
            }
            cache[encoding] = created
            return created
        }
    }
}


#if !FOUNDATION_FRAMEWORK
/// Replacement for `_icuMakeStringFromBytes(_:encoding:)` that decodes `bytes` with ICU.
///
/// Returns `nil` when no converter is available for `encoding`,
/// when `bytes` has no base address, or when decoding does not yield a string.
@_dynamicReplacement(for: _icuMakeStringFromBytes(_:encoding:))
func _icuMakeStringFromBytes_impl(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
    guard let icuConverter = ICU.StringConverter.converter(for: encoding) else {
        return nil
    }
    guard let start = bytes.baseAddress else {
        return nil
    }

    // Wrap the caller's memory in a `Data` without copying it.
    // `Data(bytesNoCopy:count:deallocator:)` only accepts a mutable raw pointer,
    // hence the `mutating:` cast; the memory is merely borrowed
    // (not escaped, not mutated) by `ICU.StringConverter.decode(data:)`,
    // and `.none` keeps `Data` from freeing it.
    // `Data` is also convenient here because it is `Sendable` (and has CoW behavior).
    let borrowedPointer = UnsafeMutableRawPointer(mutating: start)
    let borrowedData = Data(bytesNoCopy: borrowedPointer, count: bytes.count, deallocator: .none)
    return icuConverter.decode(data: borrowedData)
}

/// Replacement for `_icuStringEncodingConvert(string:using:allowLossyConversion:)`
/// that encodes `string` with ICU.
///
/// Returns `nil` when no converter is available for `encoding` or when encoding fails.
@_dynamicReplacement(for: _icuStringEncodingConvert(string:using:allowLossyConversion:))
func _icuStringEncodingConvert_impl(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
    let icuConverter = ICU.StringConverter.converter(for: encoding)
    return icuConverter?.encode(string: string, allowLossyConversion: allowLossyConversion)
}
#endif
Loading
Loading