8a66b9086a
- Started from ac75faa (initial E4B-MarkBase integration)
- Kept Sources/ (all engine code) + Package.swift + .gitignore
- Removed all ad-hoc tests, documentation, scripts, Python files
- Added Tests/00_Unit/ (MathTest, TokenizerTest, SamplerTest)
- Added .gitea/workflows/ci.yaml (build + unit tests + lint)
- Added Scripts/check_resources.sh (memory-aware test runner)
- Added Tests/Manifest.json (resource requirements for all tests)
- Focus: 4-bit quantized models only
305 lines
14 KiB
Swift
305 lines
14 KiB
Swift
import Metal
|
|
|
|
// E2B vision tower uses bfloat16 weights (not quantized)
|
|
// Linear weights are full bfloat16, converted to float32
|
|
|
|
/// Metal buffers holding the float32 weights of one E2B vision encoder layer.
///
/// Each tensor is looked up by name in a `[String: [Float]]` dictionary under the
/// `vision_tower.encoder.layers.<layerIdx>.` prefix and copied into its own buffer.
public struct VisionLayerWeightsE2B {
    // Layer norms
    public let inputLayernorm: MTLBuffer
    public let postAttentionLayernorm: MTLBuffer
    public let preFeedforwardLayernorm: MTLBuffer
    public let postFeedforwardLayernorm: MTLBuffer

    // Self-attention projections and q/k norms
    public let selfAttnQProj: MTLBuffer
    public let selfAttnKProj: MTLBuffer
    public let selfAttnVProj: MTLBuffer
    public let selfAttnOProj: MTLBuffer
    public let qNorm: MTLBuffer
    public let kNorm: MTLBuffer

    // MLP projections
    public let mlpGateProj: MTLBuffer
    public let mlpUpProj: MTLBuffer
    public let mlpDownProj: MTLBuffer

    /// Copies the float array stored under `key` into a new Metal buffer.
    ///
    /// - Throws: `WeightError.tensorNotFound` when `key` is absent, and
    ///   `WeightError.readFailed` when the tensor is empty or Metal cannot allocate
    ///   the buffer (both cases previously crashed on a force-unwrap).
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]], _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        // A zero-length request makes `makeBuffer` return nil, so reject it explicitly.
        guard byteCount > 0,
              let buffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(key) (\(byteCount) bytes)")
        }
        return buffer
    }

    /// Loads every tensor of encoder layer `layerIdx` from `floats`.
    ///
    /// - Parameters:
    ///   - device: Metal device that allocates the buffers.
    ///   - layerIdx: Index interpolated into the tensor-name prefix.
    ///   - floats: Tensor name → float32 values.
    /// - Throws: `WeightError` when a tensor is missing or its buffer cannot be created.
    public init(device: MTLDevice, layerIdx: Int, floats: [String: [Float]]) throws {
        let pfx = "vision_tower.encoder.layers.\(layerIdx)."

        // Captures only locals, so it is usable before `self` is fully initialized.
        func load(_ suffix: String) throws -> MTLBuffer {
            try Self.buffer(device, floats, pfx + suffix)
        }

        inputLayernorm = try load("input_layernorm.weight")
        postAttentionLayernorm = try load("post_attention_layernorm.weight")
        preFeedforwardLayernorm = try load("pre_feedforward_layernorm.weight")
        postFeedforwardLayernorm = try load("post_feedforward_layernorm.weight")

        qNorm = try load("self_attn.q_norm.weight")
        kNorm = try load("self_attn.k_norm.weight")

        // Linear weights - use .linear.weight suffix for E2B
        selfAttnQProj = try load("self_attn.q_proj.linear.weight")
        selfAttnKProj = try load("self_attn.k_proj.linear.weight")
        selfAttnVProj = try load("self_attn.v_proj.linear.weight")
        selfAttnOProj = try load("self_attn.o_proj.linear.weight")

        mlpGateProj = try load("mlp.gate_proj.linear.weight")
        mlpUpProj = try load("mlp.up_proj.linear.weight")
        mlpDownProj = try load("mlp.down_proj.linear.weight")
    }
}
|
|
|
|
/// All weights of the E2B vision tower: patch embedder, embedding projection,
/// and the per-layer encoder weights.
public struct VisionWeightsE2B {
    public let inputProjWeight: MTLBuffer
    public let positionEmbedding: MTLBuffer

    /// Raw bytes of the embedding-projection weight tensor, taken from `tensors`.
    public let embeddingProjectionWeight: MTLBuffer
    public let embeddingProjectionScales: MTLBuffer
    public let embeddingProjectionBiases: MTLBuffer

    public let layers: [VisionLayerWeightsE2B]

    /// Copies the float array stored under `key` into a new Metal buffer.
    ///
    /// - Throws: `WeightError.tensorNotFound` when `key` is absent, and
    ///   `WeightError.readFailed` when the tensor is empty or Metal cannot allocate
    ///   the buffer (both cases previously crashed on a force-unwrap).
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]], _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        // A zero-length request makes `makeBuffer` return nil, so reject it explicitly.
        guard byteCount > 0,
              let buffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(key) (\(byteCount) bytes)")
        }
        return buffer
    }

    /// Builds the full weight set.
    ///
    /// - Parameters:
    ///   - device: Metal device that allocates the buffers.
    ///   - config: Supplies `numHiddenLayers`, the number of encoder layers to load.
    ///   - floats: Tensor name → float32 values.
    ///   - tensors: Tensor name → raw bytes; only the embedding-projection weight is read from here.
    /// - Throws: `WeightError` when a tensor is missing or its buffer cannot be created.
    public init(device: MTLDevice, config: VisionConfig, floats: [String: [Float]], tensors: [String: Data]) throws {
        let pfx = "vision_tower.patch_embedder."

        inputProjWeight = try Self.buffer(device, floats, pfx + "input_proj.weight")
        positionEmbedding = try Self.buffer(device, floats, pfx + "position_embedding_table")

        // Embedding projection - uint32 quantized (same as E4B)
        let ep = "embed_vision.embedding_projection"
        let epWeightKey = ep + ".weight"
        guard let epWeightData = tensors[epWeightKey] else {
            // Report the key that was actually looked up, like every other lookup here.
            throw WeightError.tensorNotFound(epWeightKey)
        }
        let epWeightBuffer = epWeightData.withUnsafeBytes { raw -> MTLBuffer? in
            // `baseAddress` is nil for empty data; treat that as a failed load, not a crash.
            guard let base = raw.baseAddress, raw.count > 0 else { return nil }
            return device.makeBuffer(bytes: base, length: raw.count)
        }
        guard let epWeight = epWeightBuffer else {
            throw WeightError.readFailed(
                "Failed to create Metal buffer for \(epWeightKey) (\(epWeightData.count) bytes)")
        }
        embeddingProjectionWeight = epWeight
        embeddingProjectionScales = try Self.buffer(device, floats, ep + ".scales")
        embeddingProjectionBiases = try Self.buffer(device, floats, ep + ".biases")

        layers = try (0..<config.numHiddenLayers).map { index in
            try VisionLayerWeightsE2B(device: device, layerIdx: index, floats: floats)
        }
    }
}
|
|
|
|
/// Encodes and runs the E2B vision tower on the GPU: input projection, position
/// embedding, the encoder layers, and the final embedding projection.
///
/// NOTE(review): `applyLayer` is still a placeholder that returns its input unchanged,
/// so the encoder layers currently contribute nothing to the output.
public final class VisionTowerE2B {
    /// Failures raised while setting up or running the tower.
    public enum TowerError: Error {
        /// `numPatches` was outside `1...max`, the range the scratch buffers are sized for.
        case invalidPatchCount(Int, max: Int)
        /// Metal returned nil when asked to create the named resource.
        case resourceUnavailable(String)
    }

    public let config: VisionConfig
    public let engine: MarkBaseEngine
    public let weights: VisionWeightsE2B

    /// Number of patch rows every scratch buffer is sized for.
    private static let maxPatches = 4096

    // Scratch buffers, allocated once. Only `tempBuffer` and `normBuffer` are used by
    // the code below; the others are not referenced yet.
    private let qBuffer: MTLBuffer
    private let kBuffer: MTLBuffer
    private let vBuffer: MTLBuffer
    private let attnOutBuffer: MTLBuffer
    private let mlpBuffer: MTLBuffer
    private let tempBuffer: MTLBuffer
    private let normBuffer: MTLBuffer
    private let residualBuffer: MTLBuffer

    /// Allocates the scratch buffers for up to 4096 patches.
    ///
    /// - Throws: `TowerError.resourceUnavailable` when a scratch buffer cannot be allocated.
    public init(config: VisionConfig, engine: MarkBaseEngine, weights: VisionWeightsE2B) throws {
        self.config = config
        self.engine = engine
        self.weights = weights

        let device = engine.device
        let hiddenSize = config.hiddenSize
        let intermediateSize = config.intermediateSize
        let bytesPerColumn = VisionTowerE2B.maxPatches * MemoryLayout<Float>.stride

        // Captures only locals, so it is usable before `self` is fully initialized.
        func scratch(_ columns: Int, _ label: String) throws -> MTLBuffer {
            guard let buffer = device.makeBuffer(length: columns * bytesPerColumn) else {
                throw TowerError.resourceUnavailable("scratch buffer '\(label)'")
            }
            return buffer
        }

        qBuffer = try scratch(hiddenSize, "q")
        kBuffer = try scratch(hiddenSize, "k")
        vBuffer = try scratch(hiddenSize, "v")
        attnOutBuffer = try scratch(hiddenSize, "attnOut")
        mlpBuffer = try scratch(intermediateSize, "mlp")
        tempBuffer = try scratch(max(hiddenSize, intermediateSize), "temp")
        normBuffer = try scratch(hiddenSize, "norm")
        residualBuffer = try scratch(hiddenSize, "residual")
    }

    /// Runs the tower on `numPatches` patch embeddings and blocks until the GPU finishes.
    ///
    /// - Parameters:
    ///   - patchEmbeddings: Input buffer bound as the source of the input projection.
    ///   - numPatches: Number of patches; must be in `1...4096`.
    ///   - outputBuffer: Destination of the embedding projection.
    /// - Throws: `TowerError` for an invalid patch count or an unavailable Metal resource,
    ///   any error from `engine.pipeline(named:)`, and the command buffer's own error
    ///   if GPU execution failed.
    public func forward(patchEmbeddings: MTLBuffer, numPatches: Int, outputBuffer: MTLBuffer) throws {
        // The scratch buffers hold at most `maxPatches` rows; a larger count would make
        // the kernels write past their end, and a non-positive one is an invalid dispatch.
        guard numPatches >= 1, numPatches <= VisionTowerE2B.maxPatches else {
            throw TowerError.invalidPatchCount(numPatches, max: VisionTowerE2B.maxPatches)
        }
        guard let cmdBuf = engine.commandQueue.makeCommandBuffer() else {
            throw TowerError.resourceUnavailable("command buffer")
        }

        // Input projection: hiddenSize -> hiddenSize, written to tempBuffer
        var current = try applyFloatMatmul(input: patchEmbeddings, weight: weights.inputProjWeight,
                                           inDim: config.hiddenSize, outDim: config.hiddenSize,
                                           seqLen: numPatches, output: tempBuffer, cmdBuf: cmdBuf)

        // Add position embedding
        current = try addPositionEmbedding(input: current, numPatches: numPatches, cmdBuf: cmdBuf)

        // Vision encoder layers
        for layerWeights in weights.layers {
            current = try applyLayer(input: current, weights: layerWeights,
                                     numPatches: numPatches, cmdBuf: cmdBuf)
        }

        // Embedding projection: hiddenSize -> config.outputProjDims (quantized weights)
        try applyEmbeddingProjection(input: current, numPatches: numPatches,
                                     output: outputBuffer, cmdBuf: cmdBuf)

        cmdBuf.commit()
        cmdBuf.waitUntilCompleted()

        // Surface GPU-side failures instead of returning as if the output were valid.
        if let gpuError = cmdBuf.error {
            throw gpuError
        }
    }

    /// Creates a compute encoder on `cmdBuf`, throwing instead of crashing when Metal returns nil.
    private func makeEncoder(on cmdBuf: MTLCommandBuffer) throws -> MTLComputeCommandEncoder {
        guard let encoder = cmdBuf.makeComputeCommandEncoder() else {
            throw TowerError.resourceUnavailable("compute command encoder")
        }
        return encoder
    }

    /// Encodes one `quantized_matmul_seq` dispatch with the binding layout both matmul paths share.
    private func encodeQuantizedMatmul(input: MTLBuffer, weight: MTLBuffer,
                                       scales: MTLBuffer, biases: MTLBuffer, output: MTLBuffer,
                                       inDim: Int, outDim: Int, seqLen: Int,
                                       cmdBuf: MTLCommandBuffer) throws {
        // Resolve the pipeline before opening the encoder so a throw cannot leave it un-ended.
        let pso = try engine.pipeline(named: "quantized_matmul_seq")
        let enc = try makeEncoder(on: cmdBuf)
        enc.setComputePipelineState(pso)

        enc.setBuffer(input, offset: 0, index: 0)
        enc.setBuffer(weight, offset: 0, index: 1)
        enc.setBuffer(scales, offset: 0, index: 2)
        enc.setBuffer(biases, offset: 0, index: 3)
        enc.setBuffer(output, offset: 0, index: 4)

        var inD = UInt32(inDim)
        enc.setBytes(&inD, length: MemoryLayout<UInt32>.size, index: 5)
        var outD = UInt32(outDim)
        enc.setBytes(&outD, length: MemoryLayout<UInt32>.size, index: 6)

        let grid = MTLSize(width: outDim * seqLen, height: 1, depth: 1)
        let tg = engine.threadgroupSize1D(pso, count: outDim)
        enc.dispatchThreads(grid, threadsPerThreadgroup: tg)
        enc.endEncoding()
    }

    /// Encodes the float32-weight matmul and returns `output`.
    ///
    /// NOTE(review): this binds float32 weights to the `quantized_matmul_seq` kernel with
    /// freshly allocated placeholder scales/biases. Whether that kernel produces a correct
    /// float32 matmul under those inputs cannot be determined from this file — confirm
    /// against the kernel source, or switch to a dedicated float kernel.
    private func applyFloatMatmul(input: MTLBuffer, weight: MTLBuffer,
                                  inDim: Int, outDim: Int, seqLen: Int,
                                  output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        let placeholderLength = outDim * MemoryLayout<Float>.stride
        guard let dummyScales = engine.device.makeBuffer(length: placeholderLength),
              let dummyBiases = engine.device.makeBuffer(length: placeholderLength) else {
            throw TowerError.resourceUnavailable("placeholder scales/biases buffers")
        }

        try encodeQuantizedMatmul(input: input, weight: weight,
                                  scales: dummyScales, biases: dummyBiases, output: output,
                                  inDim: inDim, outDim: outDim, seqLen: seqLen, cmdBuf: cmdBuf)
        return output
    }

    /// Encodes `vision_add_pos_embed` over a `hiddenSize x numPatches` grid and returns
    /// `normBuffer`, which receives the result.
    private func addPositionEmbedding(input: MTLBuffer, numPatches: Int, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        let output = normBuffer
        let pso = try engine.pipeline(named: "vision_add_pos_embed")
        let enc = try makeEncoder(on: cmdBuf)
        enc.setComputePipelineState(pso)

        enc.setBuffer(input, offset: 0, index: 0)
        enc.setBuffer(weights.positionEmbedding, offset: 0, index: 1)
        enc.setBuffer(output, offset: 0, index: 2)

        var hd = UInt32(config.hiddenSize)
        enc.setBytes(&hd, length: MemoryLayout<UInt32>.size, index: 3)
        var np = UInt32(numPatches)
        enc.setBytes(&np, length: MemoryLayout<UInt32>.size, index: 4)

        let grid = MTLSize(width: config.hiddenSize, height: numPatches, depth: 1)
        let tg = engine.threadgroupSize2D(pso, grid: (config.hiddenSize, numPatches))
        enc.dispatchThreads(grid, threadsPerThreadgroup: tg)
        enc.endEncoding()

        return output
    }

    /// Placeholder for one encoder layer.
    ///
    /// The full implementation needs attention and MLP kernels; for now the input is
    /// returned unchanged and nothing is encoded.
    private func applyLayer(input: MTLBuffer, weights: VisionLayerWeightsE2B,
                            numPatches: Int, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        return input
    }

    /// Encodes the embedding projection from `hiddenSize` to `config.outputProjDims`
    /// into `output`, using the stored projection weight, scales and biases.
    private func applyEmbeddingProjection(input: MTLBuffer, numPatches: Int,
                                          output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws {
        try encodeQuantizedMatmul(input: input,
                                  weight: weights.embeddingProjectionWeight,
                                  scales: weights.embeddingProjectionScales,
                                  biases: weights.embeddingProjectionBiases,
                                  output: output,
                                  inDim: config.hiddenSize, outDim: config.outputProjDims,
                                  seqLen: numPatches, cmdBuf: cmdBuf)
    }
}
|
|
|
|
// Helper function to load E2B vision tower with preload optimization
|
|
/// Loads the E2B vision tower from a safetensors reader.
///
/// All tensors whose names start with `vision_tower.` or `embed_vision.` are read
/// concurrently, then converted sequentially: `.bf16` tensors become `[Float]`,
/// `.u32` tensors are kept as raw `Data`. Tensors of any other dtype are ignored.
///
/// - Parameters:
///   - reader: Source of tensor descriptors and tensor bytes.
///   - config: Vision configuration passed on to the weights and the tower.
///   - engine: Supplies the Metal device.
/// - Returns: A tower ready for `forward`.
/// - Throws: `WeightError.readFailed` when a tensor read fails, plus anything thrown
///   by `VisionWeightsE2B.init` or `VisionTowerE2B.init`.
public func loadVisionTowerE2B(reader: SafeTensorsReader, config: VisionConfig,
                               engine: MarkBaseEngine) throws -> VisionTowerE2B {
    print("Loading E2B Vision Tower with preload optimization...")
    let startTime = Date()

    // Collect all vision tensor names
    let visionPrefix = "vision_tower."
    let embedPrefix = "embed_vision."
    let visionDescriptors = reader.allDescriptors().filter {
        $0.name.hasPrefix(visionPrefix) || $0.name.hasPrefix(embedPrefix)
    }

    print(" Found \(visionDescriptors.count) vision tensors")

    // Parallel preload all vision tensors.
    // The closures run concurrently and all store into the same array, so every
    // write goes through `outcomesLock`; unsynchronized writes were a data race.
    let dispatchGroup = DispatchGroup()
    let loadQueue = DispatchQueue(label: "vision-preload", attributes: .concurrent)
    let outcomesLock = NSLock()
    var outcomes: [Result<Data, Error>?] = Array(repeating: nil, count: visionDescriptors.count)

    for (idx, desc) in visionDescriptors.enumerated() {
        loadQueue.async(group: dispatchGroup) {
            // Do the read outside the lock so reads still overlap.
            let outcome = Result<Data, Error> { try reader.read(tensor: desc) }
            outcomesLock.lock()
            outcomes[idx] = outcome
            outcomesLock.unlock()
        }
    }

    dispatchGroup.wait()

    // Check for errors (first failure in descriptor order wins)
    for (desc, outcome) in zip(visionDescriptors, outcomes) {
        if case .failure(let err)? = outcome {
            throw WeightError.readFailed("Failed to preload vision tensor \(desc.name): \(err)")
        }
    }

    let preloadTime = Date().timeIntervalSince(startTime) * 1000
    print(" ✓ Parallel preloaded \(visionDescriptors.count) vision tensors in \(String(format: "%.1f", preloadTime))ms")

    // Convert to floats/tensors dictionaries (sequential, but from preloaded data)
    var floats: [String: [Float]] = [:]
    var tensors: [String: Data] = [:]

    for (desc, outcome) in zip(visionDescriptors, outcomes) {
        guard case .success(let data)? = outcome else { continue }
        let name = desc.name
        if desc.dtype == .bf16 {
            floats[name] = SafeTensorsReader.bf16ToFloat32(data)
        } else if desc.dtype == .u32 {
            tensors[name] = data
        }
        // Other dtypes are dropped here; a tensor that is needed later will surface
        // as a tensorNotFound error from the weight initializers.
    }

    let weights = try VisionWeightsE2B(device: engine.device, config: config,
                                       floats: floats, tensors: tensors)

    let totalTime = Date().timeIntervalSince(startTime) * 1000
    print(" ✓ E2B Vision Tower loaded in \(String(format: "%.1f", totalTime))ms")

    return try VisionTowerE2B(config: config, engine: engine, weights: weights)
}