Compare commits
12 Commits
97798850e3
...
v2
| Author | SHA1 | Date | |
|---|---|---|---|
| ba4c41c29f | |||
| 96fe213bc4 | |||
| 97f9bdcf90 | |||
| 16c16b9bee | |||
| 7e686c3c5a | |||
| af1d10737e | |||
| 07459e8ee3 | |||
| 7a8edf77ee | |||
| 239474bef0 | |||
| 8a29dae613 | |||
| 2fd03d0ac1 | |||
| e9ab994533 |
@@ -25,7 +25,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Run Unit Tests
|
||||
run: swift test --filter "MathTest" --filter "SamplerTest" --filter "TokenizerTest"
|
||||
run: swift test --filter "MathTest" --filter "SamplerTest" --filter "TokenizerTest" --filter "ModelTest"
|
||||
|
||||
lint:
|
||||
needs: build
|
||||
|
||||
+3
-1
@@ -7,4 +7,6 @@ Package.resolved
|
||||
*.xcodeproj/
|
||||
*.xcworkspace/
|
||||
.DS_Store
|
||||
test_summary.md
|
||||
blobs/
|
||||
test_summary.md.runner
|
||||
.runner
|
||||
|
||||
@@ -161,12 +161,6 @@ extension E4BModel {
|
||||
cmdBuf: cmdBuf
|
||||
)
|
||||
|
||||
// Logits scaling
|
||||
if embedWeight.groupSize == 32 && embedWeight.inDim == hiddenSize {
|
||||
let logitsScale = Float(30.0 / 116.23 / sqrt(Float(hiddenSize)))
|
||||
try scaleBufferOptimized(logitsBuffer, scale: logitsScale, count: vocabSize, cmdBuf: cmdBuf)
|
||||
}
|
||||
|
||||
// Softcapping
|
||||
if let cap = finalLogitSoftcapping {
|
||||
try applyLogitSoftcappingOptimized(
|
||||
|
||||
@@ -160,26 +160,6 @@ embedCmdBuf.waitUntilCompleted()
|
||||
encLM.dispatchThreads(gridLM, threadsPerThreadgroup: tgLM)
|
||||
encLM.endEncoding()
|
||||
|
||||
// Logits scaling and softcapping (batch)
|
||||
if embedWeight.groupSize == 32 {
|
||||
let logitsScale = Float(30.0 / 116.23 / sqrt(Float(hiddenSize)))
|
||||
// Use eltwise_scale for batch scaling
|
||||
let pso = try engine.pipeline(named: "eltwise_scale")
|
||||
let enc = layerCmdBuf.makeComputeCommandEncoder()!
|
||||
enc.setComputePipelineState(pso)
|
||||
|
||||
enc.setBuffer(context.batchOutputBuffer, offset: 0, index: 0)
|
||||
var ls = logitsScale
|
||||
enc.setBytes(&ls, length: 4, index: 1)
|
||||
var total = UInt32(batchSize * vocabSize)
|
||||
enc.setBytes(&total, length: 4, index: 2)
|
||||
|
||||
let tg = MTLSize(width: 256, height: 1, depth: 1)
|
||||
let grid = MTLSize(width: batchSize * vocabSize, height: 1, depth: 1)
|
||||
enc.dispatchThreads(grid, threadsPerThreadgroup: tg)
|
||||
enc.endEncoding()
|
||||
}
|
||||
|
||||
// Softcapping (skip if kernel not found)
|
||||
if let cap = finalLogitSoftcapping {
|
||||
// Try to use tanh_scale kernel
|
||||
|
||||
@@ -366,9 +366,8 @@ func quantizedMatmul(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
|
||||
weights: QuantizedWeights,
|
||||
output: MTLBuffer) throws {
|
||||
// Select kernel based on quantization bits
|
||||
let kernelName = weights.bits == 8 ? "quantized_matmul_8bit" : "quantized_matmul"
|
||||
// TEMPORARILY USE FALLBACK KERNEL FOR TESTING
|
||||
if false, let pso = try? engine.pipeline(named: kernelName) {
|
||||
let kernelName = weights.bits == 8 ? "quantized_matmul_simd_8bit" : "quantized_matmul"
|
||||
if let pso = try? engine.pipeline(named: kernelName) {
|
||||
let enc = cmdBuf.makeComputeCommandEncoder()!
|
||||
enc.setComputePipelineState(pso)
|
||||
enc.setBuffer(input, offset: 0, index: 0)
|
||||
@@ -868,7 +867,7 @@ func quantizedMatmulExpert(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
|
||||
enc.setBytes(&inDim, length: MemoryLayout<UInt32>.size, index: 5)
|
||||
var outDim = UInt32(expert.expertOutDim)
|
||||
enc.setBytes(&outDim, length: MemoryLayout<UInt32>.size, index: 6)
|
||||
var groupSize = UInt32(expert.expertInDim / 64)
|
||||
var groupSize = UInt32(expert.expertInDim / expert.numGroups)
|
||||
enc.setBytes(&groupSize, length: MemoryLayout<UInt32>.size, index: 7)
|
||||
let tg = engine.threadgroupSize1D(fallbackPSO, count: expert.expertOutDim)
|
||||
enc.dispatchThreads(MTLSize(width: expert.expertOutDim, height: 1, depth: 1),
|
||||
@@ -922,7 +921,7 @@ func quantizedMatmulExpert(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
|
||||
enc.setBytes(&inDim, length: MemoryLayout<UInt32>.size, index: 8)
|
||||
var outDim = UInt32(gate.expertOutDim)
|
||||
enc.setBytes(&outDim, length: MemoryLayout<UInt32>.size, index: 9)
|
||||
var groupSize = UInt32(gate.expertInDim / 64) // group_size is 64 for quantized weights
|
||||
var groupSize = UInt32(gate.expertInDim / gate.numGroups)
|
||||
enc.setBytes(&groupSize, length: MemoryLayout<UInt32>.size, index: 10)
|
||||
let count = gate.expertOutDim
|
||||
let tg = engine.threadgroupSize1D(pso, count: count)
|
||||
@@ -977,6 +976,10 @@ func quantizedMatmulExpert(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
|
||||
gate: MoEExpertGroup, up: MoEExpertGroup, down: MoEExpertGroup,
|
||||
accum: MTLBuffer) throws -> Bool {
|
||||
guard let pso = try? engine.pipeline(named: "moe_mega_kernel") else { return false }
|
||||
// Mega kernel supports only 4-bit router with groupSize=64 experts
|
||||
guard router.bits == 4 else { return false }
|
||||
let expertGroupSize = gate.expertInDim / gate.numGroups
|
||||
guard expertGroupSize == 64 else { return false }
|
||||
let enc = cmdBuf.makeComputeCommandEncoder()!
|
||||
enc.setComputePipelineState(pso)
|
||||
enc.setBuffer(input, offset: 0, index: 0)
|
||||
@@ -1095,8 +1098,9 @@ func moeForward(input: MTLBuffer, ns: MTLBuffer,
|
||||
expertIdx: expertIdx,
|
||||
accum: temps.h, weight: weight)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// ── Step 5: Residual: input += moe_output (temps.h) scaled by layerScalar ──
|
||||
if layerScalar != 1.0 {
|
||||
try eltwiseAddScaled(engine: engine, cmdBuf: cmdBuf,
|
||||
|
||||
@@ -343,8 +343,8 @@ kernel void quantized_matmul_simd(
|
||||
uint packedBase = outRow * (inDim / 8) + g * (groupSize / 8);
|
||||
uint xBase = g * groupSize;
|
||||
|
||||
// Process 4 uint32 per iteration (32 nibbles) — half the loop count
|
||||
for (uint p = 0; p < 8; p += 4) {
|
||||
// Process 4 uint32 per iteration (32 nibbles) — half the loop count
|
||||
for (uint p = 0; p < groupSize / 8; p += 4) {
|
||||
// Vectorized uint4 load (reduces load instructions)
|
||||
device uint4 *packedPtr = (device uint4*)(&w[packedBase + p]);
|
||||
uint4 packed = *packedPtr;
|
||||
@@ -510,7 +510,7 @@ kernel void quantized_matmul_gate_up_down(
|
||||
uint wBase = gid * packedPerIn + g * (groupSize / 8);
|
||||
uint xBase = g * groupSize;
|
||||
|
||||
for (uint p = 0; p < 8; p += 4) {
|
||||
for (uint p = 0; p < groupSize / 8; p += 4) {
|
||||
device uint4 *gPtr = (device uint4*)(&w_gate[wBase + p]);
|
||||
device uint4 *uPtr = (device uint4*)(&w_up[wBase + p]);
|
||||
uint4 gP = *gPtr;
|
||||
@@ -588,7 +588,7 @@ kernel void quantized_matmul_gate_up_down(
|
||||
uint wBase = gid * packedPerOut + g * (groupSize / 8);
|
||||
uint iBase = g * groupSize;
|
||||
|
||||
for (uint p = 0; p < 8; p += 4) {
|
||||
for (uint p = 0; p < groupSize / 8; p += 4) {
|
||||
device uint4 *wPtr = (device uint4*)(&w_down[wBase + p]);
|
||||
uint4 packed = *wPtr;
|
||||
|
||||
@@ -1123,7 +1123,7 @@ kernel void quantized_matmul_gate_up_opt(
|
||||
uint wBase = gid * packedPerOut + g * (groupSize / 8);
|
||||
uint xBase = g * groupSize;
|
||||
|
||||
for (uint p = 0; p < 8; p += 4) {
|
||||
for (uint p = 0; p < groupSize / 8; p += 4) {
|
||||
device uint4 *gPtr = (device uint4*)(&w_gate[wBase + p]);
|
||||
device uint4 *uPtr = (device uint4*)(&w_up[wBase + p]);
|
||||
uint4 gP = *gPtr;
|
||||
|
||||
@@ -291,30 +291,7 @@ readers = readersDict
|
||||
// Handle optional missing scales/biases (non-quantized embedding)
|
||||
if let eg = embedGroup {
|
||||
print(" ✓ embed_tokens loaded")
|
||||
// Check if scales need normalization for custom quantization
|
||||
// For groupSize=32 models, scales are ~3000x larger than standard
|
||||
// Need to divide by hiddenSize to get correct values
|
||||
if eg.groupSize == 32 && eg.inDim == hiddenSize {
|
||||
print(" ⚠ Detected groupSize=32 custom quantization, normalizing scales...")
|
||||
let scaleCorrection = Float(hiddenSize)
|
||||
let pso = try engine.pipeline(named: "eltwise_scale")
|
||||
let cmdBuf = engine.commandQueue.makeCommandBuffer()!
|
||||
let enc = cmdBuf.makeComputeCommandEncoder()!
|
||||
enc.setComputePipelineState(pso)
|
||||
enc.setBuffer(eg.scales, offset: 0, index: 0)
|
||||
var s = 1.0 / scaleCorrection
|
||||
enc.setBytes(&s, length: MemoryLayout<Float>.size, index: 1)
|
||||
let count = eg.scales.length / MemoryLayout<Float>.stride
|
||||
var N = UInt32(count)
|
||||
enc.setBytes(&N, length: MemoryLayout<UInt32>.size, index: 2)
|
||||
let tg = engine.threadgroupSize1D(pso, count: count)
|
||||
enc.dispatchThreads(MTLSize(width: count, height: 1, depth: 1),
|
||||
threadsPerThreadgroup: tg)
|
||||
enc.endEncoding()
|
||||
cmdBuf.commit()
|
||||
cmdBuf.waitUntilCompleted()
|
||||
print(" ✓ Scales normalized (divided by \(scaleCorrection))")
|
||||
}
|
||||
// Note: groupSize=32 scale normalization now done in quantizedGroup
|
||||
self.embedWeight = eg
|
||||
} else {
|
||||
// Non-quantized: create dummy quantized wrapper (all 0 scales=1.0, biases=0.0)
|
||||
@@ -547,19 +524,31 @@ readers = readersDict
|
||||
let sName = "\(fullName).scales"
|
||||
let bName = "\(fullName).biases"
|
||||
|
||||
if let wData = preloadedDataCache[wName], let sData = preloadedDataCache[sName] {
|
||||
let bData = preloadedDataCache[bName]
|
||||
if let wData = preloadedDataCache[wName], let sData = preloadedDataCache[sName], fullName.contains("embed") == false {
|
||||
let wDesc = allTensors.first(where: { $0.name == wName })
|
||||
let sDesc = allTensors.first(where: { $0.name == sName })
|
||||
|
||||
let wShape = wDesc?.shape ?? []
|
||||
let sShape = sDesc?.shape ?? []
|
||||
let outDim = wShape.count > 0 ? wShape[0] : 0
|
||||
let packedDim = wShape.count > 1 ? wShape[1] : 0
|
||||
let inDim = packedDim * (bits == 4 ? 8 : 4)
|
||||
let groupSize = (sShape.count > 1 && sShape[1] > 0) ? inDim / sShape[1] : 64
|
||||
|
||||
let bData = preloadedDataCache[bName]
|
||||
|
||||
let wBuf = wData.withUnsafeBytes { ptr in
|
||||
engine.device.makeBuffer(bytes: ptr.baseAddress!, length: wData.count, options: .storageModeShared)
|
||||
}
|
||||
|
||||
// Convert scales from BF16 to Float32 (safetensors stores as BF16)
|
||||
let sBuf: MTLBuffer?
|
||||
if sDesc?.dtype == .bf16 {
|
||||
let sFloats = SafeTensorsReader.bf16ToFloat32(sData)
|
||||
var sFloats = SafeTensorsReader.bf16ToFloat32(sData)
|
||||
if groupSize == 32 {
|
||||
for i in 0..<sFloats.count {
|
||||
sFloats[i] = sFloats[i] / Float(inDim)
|
||||
}
|
||||
}
|
||||
sBuf = engine.device.makeBuffer(
|
||||
bytes: sFloats, length: sFloats.count * MemoryLayout<Float>.stride,
|
||||
options: .storageModeShared
|
||||
@@ -570,7 +559,6 @@ readers = readersDict
|
||||
}
|
||||
}
|
||||
|
||||
// Convert biases from BF16 to Float32
|
||||
let bBuf: MTLBuffer?
|
||||
if let bData = bData {
|
||||
if let bDesc = allTensors.first(where: { $0.name == bName }), bDesc.dtype == .bf16 {
|
||||
@@ -585,7 +573,6 @@ readers = readersDict
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No bias data, create zero biases with same count as scales
|
||||
let sCount = sDesc?.shape.reduce(1, *) ?? 0
|
||||
let bFloatsZero = [Float](repeating: 0.0, count: sCount)
|
||||
bBuf = engine.device.makeBuffer(
|
||||
@@ -599,14 +586,6 @@ readers = readersDict
|
||||
return nil
|
||||
}
|
||||
|
||||
let wShape = wDesc?.shape ?? []
|
||||
let sShape = sDesc?.shape ?? []
|
||||
|
||||
let outDim = wShape[0]
|
||||
let packedDim = wShape[1]
|
||||
let inDim = packedDim * (bits == 4 ? 8 : 4)
|
||||
let groupSize = (sShape.count > 1 && sShape[1] > 0) ? inDim / sShape[1] : 64
|
||||
|
||||
return QuantizedWeights(
|
||||
weight: wBufSafe,
|
||||
scales: sBufSafe,
|
||||
@@ -1214,7 +1193,7 @@ readers = readersDict
|
||||
let sData = try sReader.read(tensor: sDesc)
|
||||
let bData = bReader != nil && bDesc != nil ? try bReader!.read(tensor: bDesc!) : nil
|
||||
|
||||
let sFloats = SafeTensorsReader.bf16ToFloat32(sData)
|
||||
var sFloats = SafeTensorsReader.bf16ToFloat32(sData)
|
||||
let bFloats = bData != nil ? SafeTensorsReader.bf16ToFloat32(bData!) : nil
|
||||
|
||||
let outDim = wDesc.shape[0]
|
||||
@@ -1226,10 +1205,19 @@ readers = readersDict
|
||||
let numGroups = sDesc.shape[1]
|
||||
let groupSize = inDim / numGroups
|
||||
|
||||
// Normalize scales for groupSize=32 custom quantization
|
||||
// These models store scales inflated by hiddenSize factor
|
||||
if groupSize == 32 {
|
||||
for i in 0..<sFloats.count {
|
||||
sFloats[i] = sFloats[i] / Float(inDim)
|
||||
}
|
||||
}
|
||||
|
||||
guard let wBuf = device.makeBuffer(
|
||||
bytes: (wData as NSData).bytes, length: wData.count,
|
||||
options: .storageModeShared
|
||||
) else { return nil }
|
||||
|
||||
guard let sBuf = device.makeBuffer(
|
||||
bytes: sFloats, length: sFloats.count * MemoryLayout<Float>.stride,
|
||||
options: .storageModeShared
|
||||
@@ -1397,8 +1385,9 @@ readers = readersDict
|
||||
|
||||
// Scales: [numExperts, expertOutDim, numGroups] bf16
|
||||
// Biases: same shape as scales
|
||||
let groupSize = 64
|
||||
let numGroups = expertInDim / groupSize
|
||||
let numGroups = sDesc.shape.count > 2 ? sDesc.shape[2] : expertInDim / 64
|
||||
|
||||
let expertGroupSize = expertInDim / numGroups
|
||||
|
||||
// Get readers
|
||||
let wReader: SafeTensorsReader
|
||||
@@ -1427,9 +1416,16 @@ readers = readersDict
|
||||
let bDesc = bReader != nil ? findTensor(bName, in: tensors) : nil
|
||||
let bData: Data? = bDesc != nil ? try bReader!.read(tensor: bDesc!) : nil
|
||||
|
||||
let sFloats = SafeTensorsReader.bf16ToFloat32(sData)
|
||||
var sFloats = SafeTensorsReader.bf16ToFloat32(sData)
|
||||
let bFloats = bData != nil ? SafeTensorsReader.bf16ToFloat32(bData!) : nil
|
||||
|
||||
|
||||
// Normalize scales for groupSize=32 custom quantization
|
||||
if expertGroupSize == 32 {
|
||||
for i in 0..<sFloats.count {
|
||||
sFloats[i] = sFloats[i] / Float(expertInDim)
|
||||
}
|
||||
}
|
||||
|
||||
let valsPerU32 = 32 / bits
|
||||
let inDimPacked = expertInDim / valsPerU32
|
||||
|
||||
@@ -1446,7 +1442,7 @@ readers = readersDict
|
||||
bytes: (wData as NSData).bytes, length: wData.count,
|
||||
options: .storageModeShared
|
||||
) else { return nil }
|
||||
|
||||
|
||||
guard let sBuf = device.makeBuffer(
|
||||
bytes: sFloats, length: sFloats.count * MemoryLayout<Float>.stride,
|
||||
options: .storageModeShared
|
||||
@@ -1698,17 +1694,8 @@ readers = readersDict
|
||||
|
||||
// ── 5b. Logits scaling for custom quantization (groupSize=32) ──
|
||||
// For groupSize=32 models, logits are ~200x larger than standard
|
||||
// Need to scale by ~0.00486 to normalize to E4B-like range
|
||||
if embedWeight.groupSize == 32 && embedWeight.inDim == hiddenSize {
|
||||
// Total scaling: 1/sqrt(hidden_size) * (30/116) ≈ 0.00486
|
||||
// This brings logits to similar range as E4B
|
||||
let logitsScale = Float(30.0 / 116.23 / sqrt(Float(hiddenSize)))
|
||||
if position == 0 {
|
||||
print(" ⚠ Scaling logits by \(logitsScale) for groupSize=32 custom quantization")
|
||||
fflush(stdout)
|
||||
}
|
||||
try scaleBuffer(logitsBuffer, scale: logitsScale, count: vocabSize)
|
||||
}
|
||||
// NOTE: groupSize=32 scale normalization now done in quantizedGroup/loadExpertGroup
|
||||
// No additional logit scaling needed here
|
||||
|
||||
// ── 6. Logit softcapping ──
|
||||
if let cap = finalLogitSoftcapping {
|
||||
|
||||
@@ -110,12 +110,6 @@ extension E4BModel {
|
||||
try quantizedMatmulOptimized(input: lmInput, weights: embedWeight,
|
||||
output: logitsBuffer, cmdBuf: cmdBuf3)
|
||||
|
||||
// Logits scaling (if needed)
|
||||
if embedWeight.groupSize == 32 && embedWeight.inDim == hiddenSize {
|
||||
let logitsScale = Float(30.0 / 116.23 / sqrt(Float(hiddenSize)))
|
||||
try scaleBufferOptimized(logitsBuffer, scale: logitsScale, count: vocabSize, cmdBuf: cmdBuf3)
|
||||
}
|
||||
|
||||
// Logit softcapping
|
||||
if let cap = finalLogitSoftcapping {
|
||||
try applyLogitSoftcappingOptimized(buffer: logitsBuffer, cap: cap,
|
||||
|
||||
@@ -77,6 +77,8 @@ public final class VisionTower {
|
||||
enc.setBytes(&inD, length: MemoryLayout<UInt32>.size, index: 5)
|
||||
var outD = UInt32(weights.outDim)
|
||||
enc.setBytes(&outD, length: MemoryLayout<UInt32>.size, index: 6)
|
||||
var groupSize = UInt32(weights.groupSize)
|
||||
enc.setBytes(&groupSize, length: MemoryLayout<UInt32>.size, index: 7)
|
||||
|
||||
let grid = MTLSize(width: weights.outDim * seqLen, height: 1, depth: 1)
|
||||
let tg = engine.threadgroupSize1D(pso, count: max(weights.outDim, seqLen))
|
||||
|
||||
@@ -236,7 +236,7 @@ public final class VisionTower12B {
|
||||
output: MTLBuffer,
|
||||
cmdBuf: MTLCommandBuffer
|
||||
) throws {
|
||||
let pso = try engine.pipeline(named: "quantized_matmul")
|
||||
let pso = try engine.pipeline(named: "quantized_matmul_seq")
|
||||
let enc = cmdBuf.makeComputeCommandEncoder()!
|
||||
enc.setComputePipelineState(pso)
|
||||
|
||||
@@ -244,22 +244,22 @@ public final class VisionTower12B {
|
||||
enc.setBuffer(weight, offset: 0, index: 1)
|
||||
enc.setBuffer(scales, offset: 0, index: 2)
|
||||
enc.setBuffer(biases, offset: 0, index: 3)
|
||||
enc.setBuffer(output, offset: 0, index: 4)
|
||||
enc.setBuffer(bias ?? biases, offset: 0, index: 4)
|
||||
enc.setBuffer(output, offset: 0, index: 5)
|
||||
|
||||
var inD = UInt32(inDim)
|
||||
enc.setBytes(&inD, length: MemoryLayout<UInt32>.size, index: 5)
|
||||
enc.setBytes(&inD, length: 4, index: 6)
|
||||
var outD = UInt32(outDim)
|
||||
enc.setBytes(&outD, length: MemoryLayout<UInt32>.size, index: 6)
|
||||
enc.setBytes(&outD, length: 4, index: 7)
|
||||
var hasBias = bias != nil
|
||||
enc.setBytes(&hasBias, length: 1, index: 8)
|
||||
var sl = UInt32(seqLen)
|
||||
enc.setBytes(&sl, length: 4, index: 9)
|
||||
|
||||
let grid = MTLSize(width: outDim * seqLen, height: 1, depth: 1)
|
||||
let tg = engine.threadgroupSize1D(pso, count: max(outDim, seqLen))
|
||||
let grid = MTLSize(width: outDim, height: seqLen, depth: 1)
|
||||
let tg = engine.threadgroupSize2D(pso, grid: (outDim, seqLen))
|
||||
enc.dispatchThreads(grid, threadsPerThreadgroup: tg)
|
||||
enc.endEncoding()
|
||||
|
||||
// Add unquantized bias if present
|
||||
if let b = bias {
|
||||
try eltwiseAdd(input: output, bias: b, seqLen: seqLen, dim: outDim, cmdBuf: cmdBuf)
|
||||
}
|
||||
}
|
||||
|
||||
private func rmsNormSeq(
|
||||
|
||||
@@ -0,0 +1,150 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Long-context smoke tests for the `gemma-4-12b-it-4bit` checkpoint.
///
/// Every test feeds a synthetic prompt through `E4BModel.forward` one token at a
/// time and asserts that the returned logits contain no NaN values. The whole
/// suite is skipped when the checkpoint is not present at `modelDir`.
final class LongContext12BTest: XCTestCase {

    var engine: MarkBaseEngine!
    var model: E4BModel!
    // NOTE(review): machine-specific absolute path; on any other machine the suite is skipped.
    let modelDir = "/Users/accusys/MarkBaseEngine/models/gemma-4-12b-it-4bit"
    let maxCtx = 2048

    // MARK: - Lifecycle

    /// Loads the engine and model, skipping the test when the checkpoint is absent.
    ///
    /// Load errors are propagated instead of being swallowed with `try?`, so a
    /// broken loader shows up as a failure rather than as a "model not found" skip.
    override func setUpWithError() throws {
        try super.setUpWithError()
        let indexPath = modelDir + "/model.safetensors.index.json"
        guard FileManager.default.fileExists(atPath: indexPath) else {
            throw XCTSkip("12B model not found")
        }
        engine = try MarkBaseEngine(autoCompile: true)
        model = try E4BModel(modelDir: modelDir, engine: engine, maxContextLength: maxCtx)
    }

    /// Drops the model and engine after each test.
    ///
    /// XCTest creates one test-case instance per test method, so the references
    /// are released explicitly rather than left to the instance's deallocation.
    override func tearDown() {
        model = nil
        engine = nil
        super.tearDown()
    }

    // MARK: - Helpers

    /// Builds the synthetic prompt used by the tests: ids cycle through 100...1099.
    private func syntheticPrompt(length: Int) -> [Int] {
        (0..<length).map { 100 + ($0 % 1000) }
    }

    /// Counts the NaN entries in `logits`.
    private func nanCount(in logits: [Float]) -> Int {
        logits.filter { $0.isNaN }.count
    }

    /// Returns the first index holding the largest logit, together with its value.
    ///
    /// - Throws: when `logits` is empty.
    private func argmax(_ logits: [Float]) throws -> (index: Int, value: Float) {
        var best = (index: 0, value: try XCTUnwrap(logits.first, "forward returned no logits"))
        for (index, value) in logits.enumerated().dropFirst() where value > best.value {
            best = (index: index, value: value)
        }
        return best
    }

    /// Feeds `tokens` through the model starting at position 0.
    ///
    /// NaN is asserted (and a short sample printed) at the last position and at
    /// every position that is a multiple of `checkInterval`, which includes 0.
    ///
    /// - Parameters:
    ///   - tokens: Token ids; the array index is used as the position.
    ///   - checkInterval: Distance between checked positions.
    ///   - sampleSize: Number of leading logits included in the printed sample.
    ///   - label: Prefix used in the printed diagnostic line.
    ///   - failureNote: Text appended to the assertion message.
    private func runPrompt(
        _ tokens: [Int],
        checkInterval: Int,
        sampleSize: Int = 3,
        label: String = "pos",
        failureNote: String = ""
    ) throws {
        for (pos, tokenId) in tokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            guard pos == tokens.count - 1 || pos % checkInterval == 0 else { continue }
            let nans = nanCount(in: logits)
            XCTAssertEqual(nans, 0, "NaN at pos=\(pos)\(failureNote)")
            print(" \(label)=\(pos): logits[0..\(sampleSize)]=\(logits.prefix(sampleSize)) NaN=\(nans)")
        }
    }

    /// Greedily extends `prompt` by `steps` tokens, asserting NaN-free logits at each step.
    ///
    /// The first step replays the last prompt token at its own position, so the
    /// highest position forwarded is `prompt.count + steps - 2`. Callers must keep
    /// that below `maxCtx`.
    private func greedyContinue(from prompt: [Int], steps: Int) throws {
        var sequence = prompt
        for step in 0..<steps {
            let logits = try model.forward(tokenId: sequence.last ?? 0, position: sequence.count - 1)
            XCTAssertEqual(nanCount(in: logits), 0, "NaN at gen step \(step)")
            let best = try argmax(logits)
            sequence.append(best.index)
            print(" gen[\(step)]: token=\(best.index) logit=\(best.value)")
        }
    }

    // MARK: - Tests

    func testLongContext256Tokens() throws {
        let tokens = syntheticPrompt(length: 256)
        try runPrompt(tokens, checkInterval: 64, sampleSize: 5)
        try greedyContinue(from: tokens, steps: 5)
    }

    func testFullContext2048Tokens() throws {
        // Reserve room for generation so the last forwarded position is exactly
        // maxCtx - 1; prompting with maxCtx tokens would push the generation
        // steps past the context length the model was created with.
        let generationSteps = 3
        let tokens = syntheticPrompt(length: maxCtx - (generationSteps - 1))
        try runPrompt(tokens, checkInterval: 256)
        try greedyContinue(from: tokens, steps: generationSteps)
    }

    func testRepeatedTokensFullContext() throws {
        let tokens = [Int](repeating: 100, count: maxCtx / 2)
        try runPrompt(tokens, checkInterval: 256, label: "repeat pos", failureNote: " (repeated tokens)")
    }

    func testTokenIdBoundaries() throws {
        let edgeTokens = [0, 1, 2, model.vocabSize - 1]
        for (pos, tokenId) in edgeTokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            let nans = nanCount(in: logits)
            XCTAssertEqual(nans, 0, "NaN for tokenId=\(tokenId)")
            print(" edge token=\(tokenId): logits[0..3]=\(logits.prefix(3)) NaN=\(nans)")
        }
    }

    func testLongContext1024Tokens() throws {
        let tokens = syntheticPrompt(length: 1024)
        try runPrompt(tokens, checkInterval: 128)
        try greedyContinue(from: tokens, steps: 5)
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Smoke tests for the `gemma-4-12b-it-4bit` checkpoint.
///
/// The suite is skipped when the checkpoint is not present at `modelDir`.
final class Model12BTest: XCTestCase {

    var engine: MarkBaseEngine!
    var model: E4BModel!
    // NOTE(review): machine-specific absolute path; on any other machine the suite is skipped.
    let modelDir = "/Users/accusys/MarkBaseEngine/models/gemma-4-12b-it-4bit"
    let maxCtx = 64

    /// Loads the engine and model, skipping the test when the checkpoint is absent.
    ///
    /// Load errors are propagated instead of being swallowed with `try?`, so a
    /// broken loader shows up as a failure rather than as a "model not found" skip.
    override func setUpWithError() throws {
        try super.setUpWithError()
        let indexPath = modelDir + "/model.safetensors.index.json"
        guard FileManager.default.fileExists(atPath: indexPath) else {
            throw XCTSkip("gemma-4-12b-it-4bit model not found")
        }
        engine = try MarkBaseEngine(autoCompile: true)
        model = try E4BModel(modelDir: modelDir, engine: engine, maxContextLength: maxCtx)
    }

    /// Drops the model and engine after each test.
    ///
    /// XCTest creates one test-case instance per test method, so the references
    /// are released explicitly rather than left to the instance's deallocation.
    override func tearDown() {
        model = nil
        engine = nil
        super.tearDown()
    }

    func testModelLoads() throws {
        XCTAssertNotNil(model)
        XCTAssertEqual(model.hiddenSize, 3840)
        XCTAssertEqual(model.numHiddenLayers, 48)
        XCTAssertEqual(model.vocabSize, 262144)
    }

    func testBosTokenLogitsNoNaN() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        XCTAssertEqual(logits.count, model.vocabSize)
        let nanCount = logits.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN values in logits")
    }

    func testLogitSoftcapping() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        let softcap: Float = 30.0
        // Find the first violation and assert once, instead of running one
        // XCTest assertion per vocabulary entry. The negated `<=` keeps NaN
        // counted as a violation, as the per-element assertion did.
        let offender = logits.first { !(abs($0) <= softcap + 0.1) }
        XCTAssertNil(offender, "Logit \(offender.map { "\($0)" } ?? "-") exceeds softcap \(softcap)")
    }

    func testMultipleTokensProduceDifferentLogits() throws {
        let tokens = [2, 100, 1000]
        var previous: [Float]?
        for (pos, tokenId) in tokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "NaN for token=\(tokenId) pos=\(pos)")
            // Check the property the test is named for. XCTAssertFalse keeps a
            // failure message from dumping two full logit vectors.
            if let prior = previous {
                XCTAssertFalse(logits == prior, "Logits for token=\(tokenId) pos=\(pos) match the previous step")
            }
            previous = logits
        }
    }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Smoke tests for the `gemma-4-26b-standard` checkpoint.
///
/// The suite is skipped when the checkpoint is not present at `modelDir`.
final class Model26BTest: XCTestCase {

    var engine: MarkBaseEngine!
    var model: E4BModel!
    // NOTE(review): machine-specific absolute path; on any other machine the suite is skipped.
    let modelDir = "/Users/accusys/MarkBaseEngine/models/gemma-4-26b-standard"
    let maxCtx = 128

    /// Loads the engine and model, skipping the test when the checkpoint is absent.
    ///
    /// Load errors are propagated instead of being swallowed with `try?`, so a
    /// broken loader shows up as a failure rather than as a "model not found" skip.
    override func setUpWithError() throws {
        try super.setUpWithError()
        guard FileManager.default.fileExists(atPath: modelDir + "/model.safetensors") else {
            throw XCTSkip("gemma-4-26b-standard model not found")
        }
        engine = try MarkBaseEngine(autoCompile: true)
        model = try E4BModel(modelDir: modelDir, engine: engine, maxContextLength: maxCtx)
    }

    /// Drops the model and engine after each test.
    ///
    /// XCTest creates one test-case instance per test method, so the references
    /// are released explicitly rather than left to the instance's deallocation.
    override func tearDown() {
        model = nil
        engine = nil
        super.tearDown()
    }

    func testModelLoads() throws {
        XCTAssertNotNil(model)
        XCTAssertEqual(model.hiddenSize, 2816)
        XCTAssertEqual(model.numHiddenLayers, 30)
        XCTAssertEqual(model.vocabSize, 262144)
    }

    func testBosTokenLogitsNoNaN() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        XCTAssertEqual(logits.count, model.vocabSize)
        let nanCount = logits.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN values in logits")
    }

    func testLogitsNotAllSaturated() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        // 26B has no softcapping, so logits should have variation.
        // Values are bucketed to one decimal place before counting distinct ones.
        let distinctBuckets = Set(logits.map { round($0 * 10) / 10 })
        XCTAssertGreaterThan(distinctBuckets.count, 100, "Logits should have meaningful variation")
    }

    func testLogitsReasonableRange() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        // Fail on an empty result instead of silently substituting 0.
        let maxVal = try XCTUnwrap(logits.max(), "forward returned no logits")
        let minVal = try XCTUnwrap(logits.min(), "forward returned no logits")
        // NOTE(review): these are loose sanity limits carried over unchanged; confirm they
        // are intentional (in particular the 25000 upper bound on the minimum).
        XCTAssertGreaterThan(maxVal, -100)
        XCTAssertLessThan(maxVal, 100000)
        XCTAssertGreaterThan(minVal, -100000)
        XCTAssertLessThan(minVal, 25000)
        XCTAssertGreaterThan(maxVal, minVal, "Logits should have dynamic range")
    }

    func testMultipleTokensProduceDifferentLogits() throws {
        let tokens = [2, 100, 1000, 10000]
        var previous: [Float]?
        for (pos, tokenId) in tokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "NaN for token=\(tokenId) pos=\(pos)")
            // Check the property the test is named for. XCTAssertFalse keeps a
            // failure message from dumping two full logit vectors.
            if let prior = previous {
                XCTAssertFalse(logits == prior, "Logits for token=\(tokenId) pos=\(pos) match the previous step")
            }
            previous = logits
        }
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Smoke tests for the `gemma-4-31b-it-4bit` checkpoint.
///
/// The suite is skipped when the checkpoint is not present at `modelDir`.
final class Model31BTest: XCTestCase {

    var engine: MarkBaseEngine!
    var model: E4BModel!
    // NOTE(review): machine-specific absolute path; on any other machine the suite is skipped.
    let modelDir = "/Users/accusys/MarkBaseEngine/models/gemma-4-31b-it-4bit"
    let maxCtx = 64

    /// Loads the engine and model, skipping the test when the checkpoint is absent.
    ///
    /// Load errors are propagated instead of being swallowed with `try?`, so a
    /// broken loader shows up as a failure rather than as a "model not found" skip.
    override func setUpWithError() throws {
        try super.setUpWithError()
        let indexPath = modelDir + "/model.safetensors.index.json"
        guard FileManager.default.fileExists(atPath: indexPath) else {
            throw XCTSkip("gemma-4-31b-it-4bit model not found")
        }
        engine = try MarkBaseEngine(autoCompile: true)
        model = try E4BModel(modelDir: modelDir, engine: engine, maxContextLength: maxCtx)
    }

    /// Drops the model and engine after each test.
    ///
    /// XCTest creates one test-case instance per test method, so the references
    /// are released explicitly rather than left to the instance's deallocation.
    override func tearDown() {
        model = nil
        engine = nil
        super.tearDown()
    }

    func testModelLoads() throws {
        XCTAssertNotNil(model)
        XCTAssertEqual(model.hiddenSize, 5376)
        XCTAssertEqual(model.numHiddenLayers, 60)
        XCTAssertEqual(model.vocabSize, 262144)
    }

    func testBosTokenLogitsNoNaN() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        XCTAssertEqual(logits.count, model.vocabSize)
        let nanCount = logits.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN values in logits")
    }

    func testLogitSoftcapping() throws {
        let logits = try model.forward(tokenId: 2, position: 0)
        let softcap: Float = 30.0
        // Find the first violation and assert once, instead of running one
        // XCTest assertion per vocabulary entry. The negated `<=` keeps NaN
        // counted as a violation, as the per-element assertion did.
        let offender = logits.first { !(abs($0) <= softcap + 0.1) }
        XCTAssertNil(offender, "Logit \(offender.map { "\($0)" } ?? "-") exceeds softcap \(softcap)")
    }

    func testMultipleTokensProduceDifferentLogits() throws {
        let tokens = [2, 100, 1000]
        var previous: [Float]?
        for (pos, tokenId) in tokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "NaN for token=\(tokenId) pos=\(pos)")
            // Check the property the test is named for. XCTAssertFalse keeps a
            // failure message from dumping two full logit vectors.
            if let prior = previous {
                XCTAssertFalse(logits == prior, "Logits for token=\(tokenId) pos=\(pos) match the previous step")
            }
            previous = logits
        }
    }
}
|
||||
@@ -0,0 +1,122 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Smoke tests for the single-token forward pass of the E4B text model.
///
/// Every test is skipped when the model could not be loaded from `modelDir`
/// (missing weights, or engine/model initialisation failure).
final class ModelTest: XCTestCase {

    var engine: MarkBaseEngine!
    var model: E4BModel!
    let modelDir = "/Users/accusys/MarkBaseEngine/models/E4B-MarkBase"
    let maxCtx = 256

    /// Loads the engine and model when the weights file exists; otherwise
    /// leaves both properties `nil` so that the tests skip themselves.
    override func setUp() {
        super.setUp()
        guard FileManager.default.fileExists(atPath: modelDir + "/model.safetensors") else {
            return
        }
        // Bind the engine before handing it on: if engine creation failed, the
        // implicitly unwrapped `engine` would be nil when passed to the model
        // initialiser instead of the test simply being skipped.
        guard let loadedEngine = try? MarkBaseEngine(autoCompile: true) else {
            return
        }
        engine = loadedEngine
        model = try? E4BModel(modelDir: modelDir, engine: loadedEngine, maxContextLength: maxCtx)
    }

    /// Releases the model and engine after each test.
    ///
    /// XCTest keeps test case instances alive after their test has run, so
    /// without this every test method would keep its own loaded model in memory.
    override func tearDown() {
        model = nil
        engine = nil
        super.tearDown()
    }

    // MARK: - Model Loading

    /// Checks the loaded model's basic dimensions.
    func testModelLoads() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        XCTAssertNotNil(model)
        XCTAssertEqual(model.vocabSize, 262144)
        XCTAssertEqual(model.hiddenSize, 2560)
        XCTAssertEqual(model.numHiddenLayers, 42)
    }

    // MARK: - Forward Pass

    /// Token 2 at position 0 must yield one NaN-free logit per vocabulary
    /// entry, with every value strictly inside (-50, 50).
    func testBosTokenLogits() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let logits = try model.forward(tokenId: 2, position: 0)
        XCTAssertEqual(logits.count, model.vocabSize)

        let nanCount = logits.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN values in logits")

        // Bounding the minimum from below and the maximum from above bounds
        // every element; the other two comparisons were redundant.
        let lowest = try XCTUnwrap(logits.min(), "Logits are empty")
        let highest = try XCTUnwrap(logits.max(), "Logits are empty")
        XCTAssertGreaterThan(lowest, -50)
        XCTAssertLessThan(highest, 50)
    }

    /// Every logit must stay within the softcap of 30 (with 1e-3 of slack).
    ///
    /// Only the first offending value is reported, so a broken softcap yields
    /// one failure rather than one per vocabulary entry.
    func testLogitSoftcapping() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let logits = try model.forward(tokenId: 2, position: 0)
        let softcap: Float = 30.0
        let limit = softcap + 1e-3

        // `!(x <= limit)` rather than `x > limit` so that NaN is flagged too.
        if let offender = logits.first(where: { !(abs($0) <= limit) }) {
            XCTFail("Logit \(offender) exceeds softcap \(softcap)")
        }
    }

    /// Runs four tokens at consecutive positions and checks each output for NaN.
    ///
    /// NOTE(review): despite the name, nothing here compares repeated runs;
    /// determinism is exercised by `testDeterministicOutput`.
    func testMultipleTokensDeterministic() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let tokens = [2, 1024, 2048, 4096]
        var allLogits: [[Float]] = []
        for (pos, tokenId) in tokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            allLogits.append(logits)
        }
        XCTAssertEqual(allLogits.count, tokens.count)
        for logits in allLogits {
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN values in logits")
        }
    }

    /// Runs the same token at the same position twice and bounds the maximum
    /// and mean absolute difference between the two outputs.
    func testDeterministicOutput() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let r1 = try model.forward(tokenId: 99, position: 0)
        let r2 = try model.forward(tokenId: 99, position: 0)
        XCTAssertEqual(r1.count, r2.count)

        let differences = zip(r1, r2).map { abs($0 - $1) }
        let maxDiff = differences.max() ?? 0
        let avgDiff = differences.reduce(0, +) / Float(differences.count)
        XCTAssertLessThan(maxDiff, 5.0, "GPU determinism: max diff \(maxDiff) too large")
        XCTAssertLessThan(avgDiff, 1.0, "GPU determinism: avg diff \(avgDiff) too large")
    }

    /// Three different tokens at positions 0, 1 and 2 must give NaN-free
    /// outputs, and consecutive outputs must differ.
    func testKVCacheIncrements() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let r0 = try model.forward(tokenId: 2, position: 0)
        let r1 = try model.forward(tokenId: 1024, position: 1)
        let r2 = try model.forward(tokenId: 2048, position: 2)
        XCTAssertFalse(r0.elementsEqual(r1))
        XCTAssertFalse(r1.elementsEqual(r2))
        for logits in [r0, r1, r2] {
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN values in logits")
        }
    }

    /// Two different tokens at the same position must give different logits.
    func testDifferentTokensDifferentLogits() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let tokenA: [Float] = try model.forward(tokenId: 100, position: 0)
        let tokenB: [Float] = try model.forward(tokenId: 200, position: 0)
        XCTAssertNotEqual(tokenA, tokenB)
    }

    /// A spread of token IDs at position 0 must each give a full-size,
    /// NaN-free logit vector.
    func testRandomTokenId() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        for tokenId in [0, 1, 100, 1000, 10000, 100000] {
            let logits = try model.forward(tokenId: tokenId, position: 0)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN for tokenId=\(tokenId)")
            XCTAssertEqual(logits.count, model.vocabSize)
        }
    }

    // MARK: - Batched context test

    /// Feeds a 33-token prompt one token at a time and checks every step for NaN.
    func testFullContextForward() throws {
        try XCTSkipIf(model == nil, "E4B-MarkBase model not found")
        let promptTokens = [2] + Array(repeating: 1024, count: 32)
        for (pos, tokenId) in promptTokens.enumerated() {
            let logits = try model.forward(tokenId: tokenId, position: pos)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "NaN at position \(pos)")
        }
    }
}
|
||||
@@ -0,0 +1,143 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Smoke tests for the 12B multimodal model: tower forward passes, feeding
/// tower output into the text backbone, and end-to-end generation.
///
/// Every test is skipped when the model (or the tower it needs) is not loaded.
final class Multimodal12BTest: XCTestCase {

    var engine: MarkBaseEngine!
    var multimodal: MultimodalModel!
    let modelDir = "/Users/accusys/MarkBaseEngine/models/gemma-4-12b-it-4bit"
    let maxCtx = 64

    /// Per-frame feature width passed to the audio tower in these tests.
    private let audioFeatureDim = 640
    /// Byte size of one `Float`, used to size the buffers.
    private let floatSize = MemoryLayout<Float>.stride

    /// Loads the engine and model when the weight index exists; otherwise
    /// leaves both properties `nil` so that the tests skip themselves.
    override func setUp() {
        super.setUp()
        guard FileManager.default.fileExists(atPath: modelDir + "/model.safetensors.index.json") else {
            return
        }
        // Bind the engine first: if engine creation failed, the implicitly
        // unwrapped `engine` would be nil when passed to the model initialiser.
        guard let loadedEngine = try? MarkBaseEngine(autoCompile: true) else {
            return
        }
        engine = loadedEngine
        multimodal = try? MultimodalModel(modelDir: modelDir, engine: loadedEngine, maxContextLength: maxCtx)
    }

    /// Releases the model and engine after each test; XCTest keeps test case
    /// instances alive after their test has run.
    override func tearDown() {
        multimodal = nil
        engine = nil
        super.tearDown()
    }

    /// Returns `count` values drawn uniformly from `range`.
    private func randomFloats(count: Int, in range: ClosedRange<Float>) -> [Float] {
        return (0..<count).map { _ in Float.random(in: range) }
    }

    /// Checks the text model dimensions and that both towers loaded.
    func testModelLoads() throws {
        guard let multimodal = multimodal else {
            throw XCTSkip("12B model not found")
        }
        XCTAssertEqual(multimodal.textModel.hiddenSize, 3840)
        XCTAssertEqual(multimodal.textModel.numHiddenLayers, 48)
        XCTAssertNotNil(multimodal.visionTower, "VisionTower12B should load")
        XCTAssertNotNil(multimodal.audioTower, "AudioTower12B should load")
    }

    /// Runs the vision tower on 8 random patches and checks that the output
    /// is NaN-free, non-zero and of reasonable magnitude.
    func testVisionTowerForward() throws {
        guard let tower = multimodal?.visionTower else {
            throw XCTSkip("Vision tower not loaded")
        }
        let numPatches = 8
        let patches = randomFloats(count: numPatches * tower.patchDim, in: -0.5...0.5)
        let outCount = numPatches * tower.hiddenDim

        // Unwrap instead of `!` so an allocation failure fails this test
        // rather than crashing the whole test process.
        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: patches, length: patches.count * floatSize))
        let outBuf = try XCTUnwrap(engine.device.makeBuffer(length: outCount * floatSize))

        try tower.forward(patchEmbeddings: inputBuf, numPatches: numPatches, outputBuffer: outBuf)

        let out = engine.readFloats(from: outBuf, count: outCount)
        let nanCount = out.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN in vision output")

        let maxAbs = out.map { abs($0) }.max() ?? 0
        XCTAssertLessThan(maxAbs, 1e6, "Vision output magnitude should be reasonable")
        XCTAssertGreaterThan(maxAbs, 0, "Vision output should have non-zero values")
    }

    /// Runs the audio tower on 16 random frames and checks the output for NaN.
    func testAudioTowerForward() throws {
        guard let tower = multimodal?.audioTower else {
            throw XCTSkip("Audio tower not loaded")
        }
        let numFrames = 16
        let features = randomFloats(count: numFrames * audioFeatureDim, in: -1.0...1.0)
        let outCount = numFrames * tower.outDim

        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: features, length: features.count * floatSize))
        let outBuf = try XCTUnwrap(engine.device.makeBuffer(length: outCount * floatSize))

        try tower.forward(inputBuffer: inputBuf, seqLen: numFrames, outputBuffer: outBuf)

        let out = engine.readFloats(from: outBuf, count: outCount)
        let nanCount = out.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN in audio output")
    }

    /// Feeds each vision tower output row into the text backbone as a hidden
    /// state and checks the resulting logits for NaN.
    func testTextBackboneForwardAfterVisionInjection() throws {
        guard let multimodal = multimodal, let tower = multimodal.visionTower else {
            throw XCTSkip("Vision tower not loaded")
        }
        let numPatches = 4
        // Taken from the model rather than hard-coded as 3840.
        let hiddenSize = multimodal.textModel.hiddenSize
        let patches = randomFloats(count: numPatches * tower.patchDim, in: -0.5...0.5)

        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: patches, length: patches.count * floatSize))
        let visionOut = try XCTUnwrap(
            engine.device.makeBuffer(length: numPatches * hiddenSize * floatSize))
        try tower.forward(patchEmbeddings: inputBuf, numPatches: numPatches, outputBuffer: visionOut)

        for i in 0..<numPatches {
            let offset = i * hiddenSize * floatSize
            let logits = try multimodal.textModel.forwardFromHidden(
                hiddenBuffer: visionOut, offset: offset, position: i)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN after vision injection pos=\(i)")
        }
    }

    /// Feeds each audio tower output row into the text backbone as a hidden
    /// state and checks the resulting logits for NaN.
    func testTextBackboneForwardAfterAudioInjection() throws {
        guard let multimodal = multimodal, let tower = multimodal.audioTower else {
            throw XCTSkip("Audio tower not loaded")
        }
        let numFrames = 4
        // Taken from the model rather than hard-coded as 3840.
        let hiddenSize = multimodal.textModel.hiddenSize
        let features = randomFloats(count: numFrames * audioFeatureDim, in: -1.0...1.0)

        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: features, length: features.count * floatSize))
        let audioOut = try XCTUnwrap(
            engine.device.makeBuffer(length: numFrames * hiddenSize * floatSize))
        try tower.forward(inputBuffer: inputBuf, seqLen: numFrames, outputBuffer: audioOut)

        for i in 0..<numFrames {
            let offset = i * hiddenSize * floatSize
            let logits = try multimodal.textModel.forwardFromHidden(
                hiddenBuffer: audioOut, offset: offset, position: i)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN after audio injection pos=\(i)")
        }
    }

    /// Generates up to 5 tokens from one text token plus random image patches
    /// and audio frames, and checks that every returned token ID is in range.
    func testMultimodalInferenceGenerate() throws {
        guard let multimodal = multimodal, let tower = multimodal.visionTower else {
            throw XCTSkip("Vision tower not loaded")
        }
        let inference = try MultimodalInference(model: multimodal)

        let numPatches = 8
        let patches = randomFloats(count: numPatches * tower.patchDim, in: -0.5...0.5)

        // 32 audio frames, one feature vector per frame.
        let audioFeatures: [[Float]] = (0..<32).map { _ in
            randomFloats(count: audioFeatureDim, in: -1.0...1.0)
        }

        let result = try inference.generate(
            textTokens: [2],
            audioFeatures: audioFeatures,
            imagePatches: patches,
            numImagePatches: numPatches,
            maxTokens: 5
        )

        XCTAssertGreaterThan(result.count, 1, "Should generate at least one token")
        for token in result {
            XCTAssertGreaterThanOrEqual(token, 0, "Token ID should be non-negative")
            XCTAssertLessThan(token, multimodal.textModel.vocabSize, "Token ID should be within vocab range")
        }
    }
}
|
||||
@@ -0,0 +1,118 @@
|
||||
import XCTest
|
||||
@testable import MarkBase
|
||||
|
||||
/// Smoke tests for the E4B multimodal model: full vision and audio tower
/// forward passes, and feeding tower output into the text backbone.
///
/// Every test is skipped when the model (or the tower it needs) is not loaded.
final class MultimodalE4BTest: XCTestCase {

    var engine: MarkBaseEngine!
    var multimodal: MultimodalModel!
    let modelDir = "/Users/accusys/MarkBaseEngine/models/E4B-MarkBase"
    let maxCtx = 64

    /// Byte size of one `Float`, used to size the buffers.
    private let floatSize = MemoryLayout<Float>.stride

    /// Loads the engine and model when the weights file exists; otherwise
    /// leaves both properties `nil` so that the tests skip themselves.
    override func setUp() {
        super.setUp()
        guard FileManager.default.fileExists(atPath: modelDir + "/model.safetensors") else {
            return
        }
        // Bind the engine first: if engine creation failed, the implicitly
        // unwrapped `engine` would be nil when passed to the model initialiser.
        guard let loadedEngine = try? MarkBaseEngine(autoCompile: true) else {
            return
        }
        engine = loadedEngine
        multimodal = try? MultimodalModel(modelDir: modelDir, engine: loadedEngine, maxContextLength: maxCtx)
    }

    /// Releases the model and engine after each test; XCTest keeps test case
    /// instances alive after their test has run.
    override func tearDown() {
        multimodal = nil
        engine = nil
        super.tearDown()
    }

    /// Returns `count` values drawn uniformly from `range`.
    private func randomFloats(count: Int, in range: ClosedRange<Float>) -> [Float] {
        return (0..<count).map { _ in Float.random(in: range) }
    }

    /// Checks the text model hidden size and that both full towers loaded.
    func testModelLoads() throws {
        guard let multimodal = multimodal else {
            throw XCTSkip("E4B-MarkBase not found")
        }
        XCTAssertEqual(multimodal.textModel.hiddenSize, 2560)
        XCTAssertNotNil(multimodal.visionTowerFull, "Full VisionTower should load")
        XCTAssertNotNil(multimodal.audioTowerFull, "Full AudioTower should load")
    }

    /// Runs the vision tower on 4 random patches and checks that the output
    /// is NaN-free and non-zero.
    func testVisionTowerForward() throws {
        guard let tower = multimodal?.visionTowerFull else {
            throw XCTSkip("Vision tower not loaded")
        }
        let numPatches = 4
        let patchDim = 768
        let hs = tower.config.hiddenSize // 768
        let patches = randomFloats(count: numPatches * patchDim, in: -0.5...0.5)

        // Unwrap instead of `!` so an allocation failure fails this test
        // rather than crashing the whole test process.
        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: patches, length: patches.count * floatSize))
        let outBuf = try XCTUnwrap(engine.device.makeBuffer(length: numPatches * hs * floatSize))

        try tower.forward(patchEmbeddings: inputBuf, numPatches: numPatches, outputBuffer: outBuf)

        let out = engine.readFloats(from: outBuf, count: numPatches * hs)
        let nanCount = out.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN in vision output")
        let maxAbs = out.map { abs($0) }.max() ?? 0
        XCTAssertGreaterThan(maxAbs, 0, "Vision output should have non-zero values")
        print("  vision: maxAbs=\(maxAbs)")
    }

    /// Runs the audio tower on 16 random frames and checks that the output
    /// is NaN-free and non-zero.
    func testAudioTowerForward() throws {
        guard let tower = multimodal?.audioTowerFull else {
            throw XCTSkip("Audio tower not loaded")
        }
        let numFrames = 16
        let audioDim = 128
        let features = randomFloats(count: numFrames * audioDim, in: -1.0...1.0)

        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: features, length: features.count * floatSize))
        let hs = tower.config.outputProjDims
        // The output is sized for numFrames / 4 rows.
        // NOTE(review): presumably the tower downsamples time by 4 — confirm.
        let outCount = numFrames / 4 * hs
        let outBuf = try XCTUnwrap(engine.device.makeBuffer(length: outCount * floatSize))

        try tower.forward(inputBuffer: inputBuf, seqLen: numFrames, outputBuffer: outBuf)

        let out = engine.readFloats(from: outBuf, count: outCount)
        let nanCount = out.filter { $0.isNaN }.count
        XCTAssertEqual(nanCount, 0, "No NaN in audio output")
        let maxAbs = out.map { abs($0) }.max() ?? 0
        XCTAssertGreaterThan(maxAbs, 0, "Audio output should have non-zero values")
        print("  audio: maxAbs=\(maxAbs)")
    }

    /// Feeds each vision tower output row into the text backbone as a hidden
    /// state and checks the resulting logits for NaN.
    func testTextBackboneForwardAfterVisionInjection() throws {
        guard let multimodal = multimodal, let tower = multimodal.visionTowerFull else {
            throw XCTSkip("Vision tower not loaded")
        }
        let numPatches = 4
        let patchDim = 768
        let hiddenSize = multimodal.textModel.hiddenSize
        let patches = randomFloats(count: numPatches * patchDim, in: -0.5...0.5)

        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: patches, length: patches.count * floatSize))
        // NOTE(review): rows here are sized with the text model's hidden size,
        // whereas testVisionTowerForward sizes the same tower's output with
        // tower.config.hiddenSize — confirm which row width the tower writes.
        let visionOut = try XCTUnwrap(
            engine.device.makeBuffer(length: numPatches * hiddenSize * floatSize))
        try tower.forward(patchEmbeddings: inputBuf, numPatches: numPatches, outputBuffer: visionOut)

        for i in 0..<numPatches {
            let offset = i * hiddenSize * floatSize
            let logits = try multimodal.textModel.forwardFromHidden(
                hiddenBuffer: visionOut, offset: offset, position: i)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN after vision injection pos=\(i)")
        }
    }

    /// Feeds up to four audio tower output rows into the text backbone as
    /// hidden states and checks the resulting logits for NaN.
    func testTextBackboneForwardAfterAudioInjection() throws {
        guard let multimodal = multimodal, let tower = multimodal.audioTowerFull else {
            throw XCTSkip("Audio tower not loaded")
        }
        let numFrames = 16
        let audioDim = 128
        let features = randomFloats(count: numFrames * audioDim, in: -1.0...1.0)

        let inputBuf = try XCTUnwrap(
            engine.device.makeBuffer(bytes: features, length: features.count * floatSize))
        let hs = tower.config.outputProjDims
        let outFrames = numFrames / 4
        let audioOut = try XCTUnwrap(
            engine.device.makeBuffer(length: outFrames * hs * floatSize))
        try tower.forward(inputBuffer: inputBuf, seqLen: numFrames, outputBuffer: audioOut)

        for i in 0..<min(4, outFrames) {
            let offset = i * hs * floatSize
            let logits = try multimodal.textModel.forwardFromHidden(
                hiddenBuffer: audioOut, offset: offset, position: i)
            let nanCount = logits.filter { $0.isNaN }.count
            XCTAssertEqual(nanCount, 0, "No NaN after audio injection pos=\(i)")
        }
    }
}
|
||||
+29
-5
@@ -25,6 +25,30 @@
|
||||
"model": null,
|
||||
"timeout_seconds": 30,
|
||||
"schedule": "always"
|
||||
},
|
||||
"01_Model/ModelTest.swift": {
|
||||
"tier": 1,
|
||||
"memory_gb": 6,
|
||||
"gpu": true,
|
||||
"model": "E4B-MarkBase",
|
||||
"timeout_seconds": 180,
|
||||
"schedule": "on_demand"
|
||||
},
|
||||
"01_Model/Model26BTest.swift": {
|
||||
"tier": 1,
|
||||
"memory_gb": 20,
|
||||
"gpu": true,
|
||||
"model": "gemma-4-26b-standard",
|
||||
"timeout_seconds": 300,
|
||||
"schedule": "on_demand"
|
||||
},
|
||||
"01_Model/Model31BTest.swift": {
|
||||
"tier": 1,
|
||||
"memory_gb": 22,
|
||||
"gpu": true,
|
||||
"model": "gemma-4-31b-it-4bit",
|
||||
"timeout_seconds": 360,
|
||||
"schedule": "on_demand"
|
||||
}
|
||||
},
|
||||
"models": {
|
||||
@@ -78,13 +102,13 @@
|
||||
},
|
||||
"gemma-4-12b-it-4bit": {
|
||||
"path": "models/gemma-4-12b-it-4bit",
|
||||
"format": "unknown",
|
||||
"format": "markbase-4bit",
|
||||
"params": "12B",
|
||||
"weight_gb": 0.008,
|
||||
"memory_gb": 0,
|
||||
"weight_gb": 10,
|
||||
"memory_gb": 14,
|
||||
"multimodal": true,
|
||||
"status": "unavailable",
|
||||
"notes": "Corrupted/incomplete files (8KB only). Full 4-bit 12B needed."
|
||||
"status": "available",
|
||||
"notes": "Multimodal - text-only output saturates softcap (gibberish). Full model files (blobs) present."
|
||||
},
|
||||
"12B-it-MLX-8bit": {
|
||||
"path": "models/12B-it-MLX-8bit",
|
||||
|
||||
Reference in New Issue
Block a user