From 31427770b16b382ce78183895cacb4b399a51833 Mon Sep 17 00:00:00 2001 From: MarkBase Admin Date: Sun, 5 Jul 2026 13:41:48 +0800 Subject: [PATCH] v2: Apply tokenizer UTF-8 fix + Engine writeFloats helper - Tokenizer fix: collect <0xXX> bytes and decode as UTF-8 (fixes Chinese/non-ASCII character decoding) - BPETokenizer + HuggingFaceTokenizer: both updated - Engine.swift: added writeFloats() utility method - FloatWeights struct added to Layer.swift (bf16 support) - attnQBits/KBits/VBits/OBits detection added to Model.swift - bf16 layer weight support from commit 48c0347 cherry-picked --- Sources/MarkBase/Engine.swift | 7 +++++++ Sources/MarkBase/Layers/Layer.swift | 8 ++++++++ Sources/MarkBase/Model.swift | 6 +++++- Sources/MarkBase/Tokenizer/BPETokenizer.swift | 11 ++++++++++- .../MarkBase/Tokenizer/HuggingFaceTokenizer.swift | 13 ++++++++++--- 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/Sources/MarkBase/Engine.swift b/Sources/MarkBase/Engine.swift index bf6a76c..fb60715 100644 --- a/Sources/MarkBase/Engine.swift +++ b/Sources/MarkBase/Engine.swift @@ -286,4 +286,11 @@ public final class MarkBaseEngine: @unchecked Sendable { let ptr = buffer.contents().assumingMemoryBound(to: Float.self) return Array(UnsafeBufferPointer(start: ptr + offset, count: count)) } + + public func writeFloats(to buffer: MTLBuffer, values: [Float], offset: Int = 0) { + let ptr = buffer.contents().assumingMemoryBound(to: Float.self) + for i in 0.. String { + var bytes: [UInt8] = [] var result = "" var i = text.startIndex @@ -215,7 +216,7 @@ public final class BPETokenizer: Tokenizer, @unchecked Sendable { let hexStr = String(text[hexStart.." { i = text.index(after: afterHex) @@ -228,10 +229,18 @@ public final class BPETokenizer: Tokenizer, @unchecked Sendable { } } + if !bytes.isEmpty { + result += String(bytes: bytes, encoding: .utf8) ?? "" + bytes.removeAll() + } result.append(text[i]) i = text.index(after: i) } + if !bytes.isEmpty { + result += String(bytes: bytes, encoding: .utf8) ?? "" + } + return result } } diff --git a/Sources/MarkBase/Tokenizer/HuggingFaceTokenizer.swift b/Sources/MarkBase/Tokenizer/HuggingFaceTokenizer.swift index 92df4ba..ecab69d 100644 --- a/Sources/MarkBase/Tokenizer/HuggingFaceTokenizer.swift +++ b/Sources/MarkBase/Tokenizer/HuggingFaceTokenizer.swift @@ -268,11 +268,11 @@ public final class HuggingFaceTokenizer: Tokenizer { /// Decode <0xXX> byte tokens back to characters private func decodeByteTokens(_ text: String) -> String { + var bytes: [UInt8] = [] var result = "" var i = text.startIndex while i < text.endIndex { - // Check for <0xXX> pattern if text[i] == "<" { let nextIndex = text.index(after: i) if nextIndex < text.endIndex && text[nextIndex] == "0" { @@ -283,8 +283,7 @@ public final class HuggingFaceTokenizer: Tokenizer { let hexStr = String(text[hexStart.. + bytes.append(byte) let afterHex = text.index(after: hexEnd) if afterHex < text.endIndex && text[afterHex] == ">" { i = text.index(after: afterHex) @@ -297,10 +296,18 @@ public final class HuggingFaceTokenizer: Tokenizer { } } + if !bytes.isEmpty { + result += String(bytes: bytes, encoding: .utf8) ?? "" + bytes.removeAll() + } result.append(text[i]) i = text.index(after: i) } + if !bytes.isEmpty { + result += String(bytes: bytes, encoding: .utf8) ?? "" + } + return result } }