Files
markbaseengine/Tests/00_Unit/TokenizerTest.swift
T
MarkBase Admin 8a66b9086a
CI / build (push) Waiting to run
CI / unit-tests (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
v2: Initial clean branch with unit tests + CI/CD pipeline
- Started from ac75faa (initial E4B-MarkBase integration)
- Kept Sources/ (all engine code) + Package.swift + .gitignore
- Removed all ad-hoc tests, documentation, scripts, Python files
- Added Tests/00_Unit/ (MathTest, TokenizerTest, SamplerTest)
- Added .gitea/workflows/ci.yaml (build + unit tests + lint)
- Added Scripts/check_resources.sh (memory-aware test runner)
- Added Tests/Manifest.json (resource requirements for all tests)
- Focus: 4-bit quantized models only
2026-07-05 13:29:25 +08:00

58 lines
2.0 KiB
Swift

import XCTest
@testable import MarkBase
/// Unit tests for the tokenizer produced by `TokenizerFactory.load(modelDir:)`.
///
/// The tests depend on a model directory at a fixed local path. When that
/// directory (or a loadable tokenizer) is missing, the tests skip instead of
/// failing, so machines without the model still get a green run.
final class TokenizerTest: XCTestCase {
    /// Location of the model directory used to build the tokenizer under test.
    /// Kept in one place so `setUp` and `testTokenizerAvailable` cannot drift apart.
    private static let modelDir = "/Users/accusys/MarkBaseEngine/models/E4B-MarkBase"

    /// Tokenizer under test; stays `nil` when the model directory is missing
    /// or loading fails. Every test that uses it guards with `XCTSkipIf` first.
    var tokenizer: Tokenizing!

    /// Attempts to load the tokenizer before each test.
    ///
    /// Load errors are deliberately swallowed with `try?`: a failed load leaves
    /// `tokenizer` as `nil`, which `testTokenizerAvailable` reports as a failure
    /// and which makes the remaining tests skip.
    override func setUp() {
        super.setUp()
        guard FileManager.default.fileExists(atPath: Self.modelDir) else {
            return
        }
        tokenizer = try? TokenizerFactory.load(modelDir: Self.modelDir)
    }

    /// Verifies that the tokenizer loaded whenever the model directory exists.
    ///
    /// - Throws: `XCTSkip` when the model directory is not present.
    func testTokenizerAvailable() throws {
        // Must be a throwing test: `throw XCTSkip` is how XCTest marks a skip.
        guard FileManager.default.fileExists(atPath: Self.modelDir) else {
            throw XCTSkip("E4B-MarkBase model not found")
        }
        XCTAssertNotNil(tokenizer, "Tokenizer should load successfully")
    }

    /// Encodes and decodes several sample strings and checks that the decoded
    /// text matches the input, compared case-insensitively via `lowercased()`.
    func testEncodeDecodeRoundtrip() throws {
        try XCTSkipIf(tokenizer == nil, "Tokenizer not available")
        let inputs = ["Hello", "Hello World", "test", "123", "你好"]
        for input in inputs {
            let tokens = tokenizer.encode(text: input)
            let decoded = tokenizer.decode(tokens: tokens)
            XCTAssertEqual(
                decoded.lowercased(),
                input.lowercased(),
                "Roundtrip failed for '\(input)': got '\(decoded)'"
            )
        }
    }

    /// Checks that a space survives an encode/decode round trip.
    func testSpacePreservation() throws {
        try XCTSkipIf(tokenizer == nil, "Tokenizer not available")
        let input = "Hello World"
        let tokens = tokenizer.encode(text: input)
        let decoded = tokenizer.decode(tokens: tokens)
        XCTAssertTrue(decoded.contains(" "), "Spaces should be preserved in '\(decoded)'")
    }

    /// Checks that encoding the empty string still yields at least one token,
    /// which this test attributes to a prepended BOS token.
    func testSpecialTokens() throws {
        try XCTSkipIf(tokenizer == nil, "Tokenizer not available")
        // BOS token should exist
        let bosToken = tokenizer.encode(text: "")
        XCTAssertFalse(bosToken.isEmpty, "BOS token should be prepended")
    }

    /// Checks that encoding and then decoding the empty string completes and
    /// produces a non-nil result.
    func testEmptyString() throws {
        try XCTSkipIf(tokenizer == nil, "Tokenizer not available")
        let tokens = tokenizer.encode(text: "")
        let decoded = tokenizer.decode(tokens: tokens)
        XCTAssertNotNil(decoded)
    }
}