Files
markbaseengine/Sources/MarkBase/Vision/VisionTowerE2B.swift
T
MarkBase Admin 8a66b9086a
CI / build (push) Waiting to run
CI / unit-tests (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
v2: Initial clean branch with unit tests + CI/CD pipeline
- Started from ac75faa (initial E4B-MarkBase integration)
- Kept Sources/ (all engine code) + Package.swift + .gitignore
- Removed all ad-hoc tests, documentation, scripts, Python files
- Added Tests/00_Unit/ (MathTest, TokenizerTest, SamplerTest)
- Added .gitea/workflows/ci.yaml (build + unit tests + lint)
- Added Scripts/check_resources.sh (memory-aware test runner)
- Added Tests/Manifest.json (resource requirements for all tests)
- Focus: 4-bit quantized models only
2026-07-05 13:29:25 +08:00

305 lines
14 KiB
Swift

import Metal
// E2B vision tower uses bfloat16 weights (not quantized)
// Linear weights are full bfloat16, converted to float32
/// Float32 Metal buffers holding the weights of one E2B vision encoder layer.
///
/// Each tensor is looked up in a `[String: [Float]]` dictionary under
/// `vision_tower.encoder.layers.<layerIdx>.<suffix>` and copied into its own `MTLBuffer`.
public struct VisionLayerWeightsE2B {
    public let inputLayernorm: MTLBuffer
    public let postAttentionLayernorm: MTLBuffer
    public let preFeedforwardLayernorm: MTLBuffer
    public let postFeedforwardLayernorm: MTLBuffer
    public let selfAttnQProj: MTLBuffer
    public let selfAttnKProj: MTLBuffer
    public let selfAttnVProj: MTLBuffer
    public let selfAttnOProj: MTLBuffer
    public let qNorm: MTLBuffer
    public let kNorm: MTLBuffer
    public let mlpGateProj: MTLBuffer
    public let mlpUpProj: MTLBuffer
    public let mlpDownProj: MTLBuffer

    /// Copies the float array stored under `key` into a new Metal buffer.
    ///
    /// - Parameters:
    ///   - device: Device that allocates the buffer.
    ///   - floats: Tensor name to float32 values.
    ///   - key: Full tensor name to look up.
    /// - Throws: `WeightError.tensorNotFound` when `key` is missing, and
    ///   `WeightError.readFailed` when the tensor is empty or the buffer cannot be created.
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]], _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        // `makeBuffer` returns an optional; report the failure instead of crashing on `!`.
        guard byteCount > 0,
              let metalBuffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(key) (\(byteCount) bytes)")
        }
        return metalBuffer
    }

    /// Loads every weight of encoder layer `layerIdx`.
    ///
    /// - Parameters:
    ///   - device: Device that allocates the buffers.
    ///   - layerIdx: Index interpolated into the tensor-name prefix.
    ///   - floats: Tensor name to float32 values.
    /// - Throws: Any error from the private `buffer` helper; the first missing or
    ///   unallocatable tensor aborts the load.
    public init(device: MTLDevice, layerIdx: Int, floats: [String: [Float]]) throws {
        let layerPrefix = "vision_tower.encoder.layers.\(layerIdx)."
        let load: (String) throws -> MTLBuffer = { suffix in
            try Self.buffer(device, floats, layerPrefix + suffix)
        }

        // Norm weights use a plain `.weight` suffix.
        inputLayernorm = try load("input_layernorm.weight")
        postAttentionLayernorm = try load("post_attention_layernorm.weight")
        preFeedforwardLayernorm = try load("pre_feedforward_layernorm.weight")
        postFeedforwardLayernorm = try load("post_feedforward_layernorm.weight")
        qNorm = try load("self_attn.q_norm.weight")
        kNorm = try load("self_attn.k_norm.weight")

        // Linear weights use the `.linear.weight` suffix for E2B.
        selfAttnQProj = try load("self_attn.q_proj.linear.weight")
        selfAttnKProj = try load("self_attn.k_proj.linear.weight")
        selfAttnVProj = try load("self_attn.v_proj.linear.weight")
        selfAttnOProj = try load("self_attn.o_proj.linear.weight")
        mlpGateProj = try load("mlp.gate_proj.linear.weight")
        mlpUpProj = try load("mlp.up_proj.linear.weight")
        mlpDownProj = try load("mlp.down_proj.linear.weight")
    }
}
/// All Metal buffers needed by the E2B vision tower: the patch embedder, the
/// embedding projection and one `VisionLayerWeightsE2B` per encoder layer.
public struct VisionWeightsE2B {
    public let inputProjWeight: MTLBuffer
    public let positionEmbedding: MTLBuffer
    public let embeddingProjectionWeight: MTLBuffer
    public let embeddingProjectionScales: MTLBuffer
    public let embeddingProjectionBiases: MTLBuffer
    public let layers: [VisionLayerWeightsE2B]

    /// Copies the float array stored under `key` into a new Metal buffer.
    ///
    /// - Throws: `WeightError.tensorNotFound` when `key` is missing, and
    ///   `WeightError.readFailed` when the tensor is empty or the buffer cannot be created.
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]], _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        // `makeBuffer` returns an optional; report the failure instead of crashing on `!`.
        guard byteCount > 0,
              let metalBuffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(key) (\(byteCount) bytes)")
        }
        return metalBuffer
    }

    /// Builds the full weight set.
    ///
    /// - Parameters:
    ///   - device: Device that allocates the buffers.
    ///   - config: Supplies `numHiddenLayers`, the number of encoder layers to load.
    ///   - floats: Tensor name to float32 values.
    ///   - tensors: Tensor name to raw bytes; used for the packed embedding-projection weight.
    /// - Throws: `WeightError.tensorNotFound` for a missing tensor and
    ///   `WeightError.readFailed` when a buffer cannot be created.
    public init(device: MTLDevice, config: VisionConfig, floats: [String: [Float]], tensors: [String: Data]) throws {
        let patchPrefix = "vision_tower.patch_embedder."
        inputProjWeight = try Self.buffer(device, floats, patchPrefix + "input_proj.weight")
        positionEmbedding = try Self.buffer(device, floats, patchPrefix + "position_embedding_table")

        // Embedding projection - uint32 quantized (same as E4B); the weight is uploaded as raw bytes.
        let projectionPrefix = "embed_vision.embedding_projection"
        let packedKey = projectionPrefix + ".weight"
        guard let packedWeight = tensors[packedKey] else {
            // Report the key that was actually looked up, including its prefix.
            throw WeightError.tensorNotFound(packedKey)
        }
        let uploaded = packedWeight.withUnsafeBytes { (raw: UnsafeRawBufferPointer) -> MTLBuffer? in
            // `baseAddress` is nil for empty data; treat that as a failed upload.
            guard let base = raw.baseAddress, raw.count > 0 else { return nil }
            return device.makeBuffer(bytes: base, length: raw.count)
        }
        guard let packedBuffer = uploaded else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(packedKey) (\(packedWeight.count) bytes)")
        }
        embeddingProjectionWeight = packedBuffer
        embeddingProjectionScales = try Self.buffer(device, floats, projectionPrefix + ".scales")
        embeddingProjectionBiases = try Self.buffer(device, floats, projectionPrefix + ".biases")

        layers = try (0..<config.numHiddenLayers).map { layerIdx in
            try VisionLayerWeightsE2B(device: device, layerIdx: layerIdx, floats: floats)
        }
    }
}
/// Encodes the E2B vision tower forward pass onto the engine's Metal command queue.
///
/// The pass is: input projection, position-embedding add, encoder layers, embedding
/// projection. NOTE(review): `applyLayer` is still a placeholder that returns its input,
/// so the encoder layers currently contribute nothing to the output.
public final class VisionTowerE2B {
    /// Failures raised while allocating scratch memory or encoding GPU work.
    public enum TowerError: Error {
        /// The device could not allocate a buffer of the given size.
        case bufferAllocationFailed(byteCount: Int)
        /// `numPatches` was negative or larger than the scratch buffers were sized for.
        case invalidPatchCount(Int, maxSupported: Int)
        /// A command buffer or compute encoder could not be created.
        case commandEncodingFailed(String)
    }

    public let config: VisionConfig
    public let engine: MarkBaseEngine
    public let weights: VisionWeightsE2B

    /// Number of patches every scratch buffer is sized for.
    private let maxPatches: Int

    // Scratch buffers. Only `tempBuffer` and `normBuffer` are used by the code in this
    // class today; the rest are allocated for the not-yet-implemented `applyLayer`.
    private let qBuffer: MTLBuffer
    private let kBuffer: MTLBuffer
    private let vBuffer: MTLBuffer
    private let attnOutBuffer: MTLBuffer
    private let mlpBuffer: MTLBuffer
    private let tempBuffer: MTLBuffer
    private let normBuffer: MTLBuffer
    private let residualBuffer: MTLBuffer

    /// Allocates a scratch buffer, throwing instead of crashing when the device refuses.
    private static func makeScratchBuffer(_ device: MTLDevice, byteCount: Int) throws -> MTLBuffer {
        guard byteCount > 0, let scratch = device.makeBuffer(length: byteCount) else {
            throw TowerError.bufferAllocationFailed(byteCount: byteCount)
        }
        return scratch
    }

    /// Creates the tower and allocates float32 scratch buffers sized for 4096 patches.
    ///
    /// - Parameters:
    ///   - config: Supplies `hiddenSize`, `intermediateSize` and `outputProjDims`.
    ///   - engine: Provides the Metal device, command queue and pipeline lookup.
    ///   - weights: Pre-loaded weight buffers.
    /// - Throws: `TowerError.bufferAllocationFailed` when a scratch buffer cannot be allocated.
    public init(config: VisionConfig, engine: MarkBaseEngine, weights: VisionWeightsE2B) throws {
        self.config = config
        self.engine = engine
        self.weights = weights

        let device = engine.device
        let patchCapacity = 4096
        maxPatches = patchCapacity

        let bytesPerFloat = MemoryLayout<Float>.stride
        let hiddenBytes = config.hiddenSize * patchCapacity * bytesPerFloat
        let intermediateBytes = config.intermediateSize * patchCapacity * bytesPerFloat

        qBuffer = try Self.makeScratchBuffer(device, byteCount: hiddenBytes)
        kBuffer = try Self.makeScratchBuffer(device, byteCount: hiddenBytes)
        vBuffer = try Self.makeScratchBuffer(device, byteCount: hiddenBytes)
        attnOutBuffer = try Self.makeScratchBuffer(device, byteCount: hiddenBytes)
        mlpBuffer = try Self.makeScratchBuffer(device, byteCount: intermediateBytes)
        tempBuffer = try Self.makeScratchBuffer(device, byteCount: max(hiddenBytes, intermediateBytes))
        normBuffer = try Self.makeScratchBuffer(device, byteCount: hiddenBytes)
        residualBuffer = try Self.makeScratchBuffer(device, byteCount: hiddenBytes)
    }

    /// Runs the vision tower on `numPatches` patch embeddings and blocks until the GPU finishes.
    ///
    /// - Parameters:
    ///   - patchEmbeddings: Input buffer bound as the first matmul's input.
    ///   - numPatches: Number of patches; must be in `0...4096`. Zero is a no-op.
    ///   - outputBuffer: Destination of the final embedding projection.
    /// - Throws: `TowerError.invalidPatchCount` for an out-of-range `numPatches`,
    ///   `TowerError.commandEncodingFailed` when Metal objects cannot be created, pipeline
    ///   lookup errors from the engine, and the command buffer's own error if the GPU work fails.
    public func forward(patchEmbeddings: MTLBuffer, numPatches: Int, outputBuffer: MTLBuffer) throws {
        // The scratch buffers hold at most `maxPatches` rows; anything larger would make
        // the kernels write past the end of them.
        guard numPatches >= 0, numPatches <= maxPatches else {
            throw TowerError.invalidPatchCount(numPatches, maxSupported: maxPatches)
        }
        guard numPatches > 0 else { return }

        guard let cmdBuf = engine.commandQueue.makeCommandBuffer() else {
            throw TowerError.commandEncodingFailed("makeCommandBuffer returned nil")
        }

        // Input projection: [numPatches, hiddenSize] -> [numPatches, hiddenSize].
        var current = try applyFloatMatmul(input: patchEmbeddings, weight: weights.inputProjWeight,
                                           inDim: config.hiddenSize, outDim: config.hiddenSize,
                                           seqLen: numPatches, output: tempBuffer, cmdBuf: cmdBuf)

        // Add position embedding.
        current = try addPositionEmbedding(input: current, numPatches: numPatches, cmdBuf: cmdBuf)

        // Encoder layers, one per entry in `weights.layers`.
        for layerWeights in weights.layers {
            current = try applyLayer(input: current, weights: layerWeights,
                                     numPatches: numPatches, cmdBuf: cmdBuf)
        }

        // Embedding projection: [numPatches, hiddenSize] -> [numPatches, outputProjDims].
        try applyEmbeddingProjection(input: current, numPatches: numPatches,
                                     output: outputBuffer, cmdBuf: cmdBuf)

        cmdBuf.commit()
        cmdBuf.waitUntilCompleted()

        // Surface GPU-side failures instead of returning as if the output were valid.
        if let gpuError = cmdBuf.error {
            throw gpuError
        }
    }

    /// Encodes one `quantized_matmul_seq` dispatch with the buffer/constant layout used
    /// throughout this class: input, weight, scales, biases, output at indices 0-4 and
    /// `inDim` / `outDim` as `UInt32` constants at indices 5-6.
    private func encodeMatmulSeq(input: MTLBuffer, weight: MTLBuffer,
                                 scales: MTLBuffer, biases: MTLBuffer, output: MTLBuffer,
                                 inDim: Int, outDim: Int, seqLen: Int,
                                 cmdBuf: MTLCommandBuffer) throws {
        // Look the pipeline up before opening an encoder so a throw cannot leave one open.
        let pso = try engine.pipeline(named: "quantized_matmul_seq")
        guard let enc = cmdBuf.makeComputeCommandEncoder() else {
            throw TowerError.commandEncodingFailed("makeComputeCommandEncoder returned nil")
        }
        enc.setComputePipelineState(pso)
        enc.setBuffer(input, offset: 0, index: 0)
        enc.setBuffer(weight, offset: 0, index: 1)
        enc.setBuffer(scales, offset: 0, index: 2)
        enc.setBuffer(biases, offset: 0, index: 3)
        enc.setBuffer(output, offset: 0, index: 4)

        var inputDim = UInt32(inDim)
        enc.setBytes(&inputDim, length: MemoryLayout<UInt32>.size, index: 5)
        var outputDim = UInt32(outDim)
        enc.setBytes(&outputDim, length: MemoryLayout<UInt32>.size, index: 6)

        // One thread per output element across the whole sequence.
        let grid = MTLSize(width: outDim * seqLen, height: 1, depth: 1)
        let threadgroup = engine.threadgroupSize1D(pso, count: outDim)
        enc.dispatchThreads(grid, threadsPerThreadgroup: threadgroup)
        enc.endEncoding()
    }

    /// Encodes a matmul of `input` against float32 `weight` and returns `output`.
    ///
    /// NOTE(review): this reuses the `quantized_matmul_seq` pipeline with float32 weights
    /// and freshly allocated (never written) scale/bias buffers. The kernel is not visible
    /// from this file - confirm it produces a plain float matmul under those inputs.
    private func applyFloatMatmul(input: MTLBuffer, weight: MTLBuffer,
                                  inDim: Int, outDim: Int, seqLen: Int,
                                  output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        // Placeholder scale/bias buffers required by the kernel's binding layout.
        let placeholderBytes = outDim * MemoryLayout<Float>.stride
        let dummyScales = try Self.makeScratchBuffer(engine.device, byteCount: placeholderBytes)
        let dummyBiases = try Self.makeScratchBuffer(engine.device, byteCount: placeholderBytes)

        try encodeMatmulSeq(input: input, weight: weight,
                            scales: dummyScales, biases: dummyBiases, output: output,
                            inDim: inDim, outDim: outDim, seqLen: seqLen, cmdBuf: cmdBuf)
        return output
    }

    /// Encodes the `vision_add_pos_embed` kernel over a `hiddenSize` x `numPatches` grid,
    /// writing into `normBuffer`, which is returned.
    private func addPositionEmbedding(input: MTLBuffer, numPatches: Int, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        let output = normBuffer
        let pso = try engine.pipeline(named: "vision_add_pos_embed")
        guard let enc = cmdBuf.makeComputeCommandEncoder() else {
            throw TowerError.commandEncodingFailed("makeComputeCommandEncoder returned nil")
        }
        enc.setComputePipelineState(pso)
        enc.setBuffer(input, offset: 0, index: 0)
        enc.setBuffer(weights.positionEmbedding, offset: 0, index: 1)
        enc.setBuffer(output, offset: 0, index: 2)

        var hiddenDim = UInt32(config.hiddenSize)
        enc.setBytes(&hiddenDim, length: MemoryLayout<UInt32>.size, index: 3)
        var patchCount = UInt32(numPatches)
        enc.setBytes(&patchCount, length: MemoryLayout<UInt32>.size, index: 4)

        let grid = MTLSize(width: config.hiddenSize, height: numPatches, depth: 1)
        let threadgroup = engine.threadgroupSize2D(pso, grid: (config.hiddenSize, numPatches))
        enc.dispatchThreads(grid, threadsPerThreadgroup: threadgroup)
        enc.endEncoding()
        return output
    }

    /// Placeholder for one encoder layer: encodes nothing and returns `input` unchanged.
    /// The full implementation needs attention and MLP kernels.
    private func applyLayer(input: MTLBuffer, weights: VisionLayerWeightsE2B,
                            numPatches: Int, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        return input
    }

    /// Encodes the quantized embedding projection from `hiddenSize` to `outputProjDims`
    /// for `numPatches` rows, writing into `output`.
    private func applyEmbeddingProjection(input: MTLBuffer, numPatches: Int,
                                          output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws {
        try encodeMatmulSeq(input: input,
                            weight: weights.embeddingProjectionWeight,
                            scales: weights.embeddingProjectionScales,
                            biases: weights.embeddingProjectionBiases,
                            output: output,
                            inDim: config.hiddenSize, outDim: config.outputProjDims,
                            seqLen: numPatches, cmdBuf: cmdBuf)
    }
}
/// Loads the E2B vision tower, reading all vision tensors concurrently before
/// converting them and building the Metal weight buffers.
///
/// Tensors whose names start with `vision_tower.` or `embed_vision.` are read in
/// parallel, then `bf16` tensors are converted to float32 and `u32` tensors are kept
/// as raw bytes. Progress and timings are printed to standard output.
///
/// - Parameters:
///   - reader: Source of tensor descriptors and tensor bytes.
///     NOTE(review): `reader.read(tensor:)` is called from several threads at once -
///     confirm `SafeTensorsReader` supports concurrent reads.
///   - config: Vision configuration forwarded to the weights and the tower.
///   - engine: Engine whose Metal device allocates the buffers.
/// - Returns: A ready-to-use `VisionTowerE2B`.
/// - Throws: `WeightError.readFailed` for the first tensor (in descriptor order) that
///   failed to load, plus any error from `VisionWeightsE2B` or `VisionTowerE2B` construction.
public func loadVisionTowerE2B(reader: SafeTensorsReader, config: VisionConfig,
                               engine: MarkBaseEngine) throws -> VisionTowerE2B {
    print("Loading E2B Vision Tower with preload optimization...")
    let startTime = Date()

    // Collect all vision tensor descriptors.
    let visionPrefix = "vision_tower."
    let embedPrefix = "embed_vision."
    let visionDescriptors = reader.allDescriptors().filter {
        $0.name.hasPrefix(visionPrefix) || $0.name.hasPrefix(embedPrefix)
    }
    print("  Found \(visionDescriptors.count) vision tensors")

    // Parallel preload of all vision tensors.
    let dispatchGroup = DispatchGroup()
    let loadQueue = DispatchQueue(label: "vision-preload", attributes: .concurrent)
    // The result arrays are shared by every worker. Swift arrays are not safe to mutate
    // from several threads, even at distinct indices, so every write takes this lock.
    let resultLock = NSLock()
    var loadedData: [Data?] = Array(repeating: nil, count: visionDescriptors.count)
    var loadErrors: [Error?] = Array(repeating: nil, count: visionDescriptors.count)

    for (idx, desc) in visionDescriptors.enumerated() {
        dispatchGroup.enter()
        loadQueue.async {
            defer { dispatchGroup.leave() }
            do {
                // The read itself runs unlocked so tensors still load in parallel.
                let data = try reader.read(tensor: desc)
                resultLock.lock()
                loadedData[idx] = data
                resultLock.unlock()
            } catch {
                resultLock.lock()
                loadErrors[idx] = error
                resultLock.unlock()
            }
        }
    }
    dispatchGroup.wait()

    // Report the first failure in descriptor order.
    for (idx, error) in loadErrors.enumerated() {
        if let err = error {
            throw WeightError.readFailed("Failed to preload vision tensor \(visionDescriptors[idx].name): \(err)")
        }
    }

    let preloadTime = Date().timeIntervalSince(startTime) * 1000
    print("  ✓ Parallel preloaded \(visionDescriptors.count) vision tensors in \(String(format: "%.1f", preloadTime))ms")

    // Convert to floats/tensors dictionaries (sequential, but from preloaded data).
    // Tensors that are neither bf16 nor u32 are skipped here; if one of them is required,
    // `VisionWeightsE2B` reports it as `tensorNotFound`.
    var floats: [String: [Float]] = [:]
    var tensors: [String: Data] = [:]
    for (desc, payload) in zip(visionDescriptors, loadedData) {
        guard let data = payload else { continue }
        if desc.dtype == .bf16 {
            floats[desc.name] = SafeTensorsReader.bf16ToFloat32(data)
        } else if desc.dtype == .u32 {
            tensors[desc.name] = data
        }
    }

    let weights = try VisionWeightsE2B(device: engine.device, config: config,
                                       floats: floats, tensors: tensors)
    let totalTime = Date().timeIntervalSince(startTime) * 1000
    print("  ✓ E2B Vision Tower loaded in \(String(format: "%.1f", totalTime))ms")
    return try VisionTowerE2B(config: config, engine: engine, weights: weights)
}