@@ -64,7 +64,7 @@ if n_kv_req > n_ctx {
 
 var buffer: [CChar] = []
 for id: llama_token in tokens {
-    print(token_to_piece(token: id), terminator: "")
+    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
 }
 
 print("\n")
@@ -101,6 +101,7 @@ if n_parallel > 1 {
 }
 
 var streams: [String] = .init(repeating: "", count: n_parallel)
+var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
 var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
 
 var n_cur = batch.n_tokens
@@ -157,12 +158,13 @@ while n_cur <= n_len {
             continue
         }
 
+        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
+
         // if there is only one stream, we print immediately to stdout
         if n_parallel == 1 {
-            print(token_to_piece(token: new_token_id), terminator: "")
+            print(nextStringPiece, terminator: "")
         }
-
-        streams[i] += token_to_piece(token: new_token_id)
+        streams[i] += nextStringPiece
 
         // push this new token for next evaluation
         batch.token[Int(batch.n_tokens)] = new_token_id
@@ -216,11 +218,38 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     return swiftTokens
 }
 
-private func token_to_piece(token: llama_token) -> String {
-    let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
-    result.initialize(repeating: Int8(0), count: 8)
-    let _ = llama_token_to_piece(model, token, result, 8)
-    let resultStr = String(cString: result)
-    result.deallocate()
-    return resultStr
+private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
+    var result = [CChar](repeating: 0, count: 8)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    if nTokens < 0 {
+        if result.count >= -Int(nTokens) {
+            result.removeLast(-Int(nTokens))
+        } else {
+            result.removeAll()
+        }
+        let check = llama_token_to_piece(
+            model,
+            token,
+            &result,
+            Int32(result.count)
+        )
+        assert(check == nTokens)
+    } else {
+        result.removeLast(result.count - Int(nTokens))
+    }
+    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
+        return utfString
+    } else {
+        buffer.append(contentsOf: result)
+        let data = Data(buffer.map { UInt8(bitPattern: $0) })
+        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
+            buffer = []
+        }
+        guard let bufferString = String(data: data, encoding: .utf8) else {
+            return nil
+        }
+        buffer = []
+        return bufferString
+    }
+    return nil
 }
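
Note (illustration, not part of the commit): the reason token_to_piece now threads a buffer: inout [CChar] through the call sites, with one buffer per parallel stream, is that a multi-byte UTF-8 character can be split across two token pieces, so a piece may not decode on its own until the remaining bytes arrive on a later call. The standalone sketch below mimics that accumulation idea with plain byte arrays; flush(piece:buffer:) and the sample bytes are hypothetical names for illustration, and it deliberately omits the diff's extra details (the second llama_token_to_piece call that resizes the result, and the reset once the pending buffer reaches 4 bytes).

import Foundation

// Minimal sketch, assuming a hypothetical flush(piece:buffer:) in place of token_to_piece.
func flush(piece: [CChar], buffer: inout [CChar]) -> String? {
    // Fast path: nothing pending and the piece alone is valid UTF-8.
    if buffer.isEmpty, let s = String(bytes: piece.map { UInt8(bitPattern: $0) }, encoding: .utf8) {
        return s
    }
    // Otherwise accumulate the raw bytes and retry on this (and later) calls.
    buffer.append(contentsOf: piece)
    let data = Data(buffer.map { UInt8(bitPattern: $0) })
    guard let s = String(data: data, encoding: .utf8) else {
        return nil // still an incomplete sequence; keep the bytes for the next piece
    }
    buffer = []
    return s
}

// "é" is 0xC3 0xA9 in UTF-8; feed it as two separate one-byte pieces.
var pending: [CChar] = []
for half: [CChar] in [[CChar(bitPattern: 0xC3)], [CChar(bitPattern: 0xA9)]] {
    if let s = flush(piece: half, buffer: &pending) {
        print(s, terminator: "") // prints "é" only once both bytes have arrived
    }
}
print()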