// markbaseengine/Sources/MarkBase/Vision/VisionTowerE2B.swift

import Foundation
import Metal

// The E2B vision tower stores its linear weights as unquantized bfloat16.
// They are converted to float32 before being copied into Metal buffers.

/// Metal buffers holding the float32 weights of a single E2B vision encoder layer.
///
/// Each tensor is looked up in a name → `[Float]` dictionary under the
/// `vision_tower.encoder.layers.<layerIdx>.` prefix and copied into its own buffer.
public struct VisionLayerWeightsE2B {
    // Normalization weights
    public let inputLayernorm: MTLBuffer
    public let postAttentionLayernorm: MTLBuffer
    public let preFeedforwardLayernorm: MTLBuffer
    public let postFeedforwardLayernorm: MTLBuffer

    // Self-attention projections and q/k norms
    public let selfAttnQProj: MTLBuffer
    public let selfAttnKProj: MTLBuffer
    public let selfAttnVProj: MTLBuffer
    public let selfAttnOProj: MTLBuffer
    public let qNorm: MTLBuffer
    public let kNorm: MTLBuffer

    // MLP projections
    public let mlpGateProj: MTLBuffer
    public let mlpUpProj: MTLBuffer
    public let mlpDownProj: MTLBuffer

    /// Copies the tensor stored under `key` into a newly created Metal buffer.
    ///
    /// - Parameters:
    ///   - device: Device that allocates the buffer.
    ///   - floats: Tensor name → float32 values.
    ///   - key: Full tensor name to look up.
    /// - Returns: A buffer containing a copy of the tensor's values.
    /// - Throws: `WeightError.tensorNotFound` when `key` is missing from `floats`;
    ///   `WeightError.readFailed` when the tensor is empty or the device returns no buffer.
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]], _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        // Report a failed or zero-length allocation as an error instead of
        // crashing on a force-unwrap.
        guard byteCount > 0, let metalBuffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(key) (\(byteCount) bytes)")
        }
        return metalBuffer
    }

    /// Loads every weight of layer `layerIdx` from `floats`.
    ///
    /// - Parameters:
    ///   - device: Device that allocates the buffers.
    ///   - layerIdx: Index interpolated into the tensor-name prefix.
    ///   - floats: Tensor name → float32 values.
    /// - Throws: `WeightError.tensorNotFound` for the first missing tensor, or
    ///   `WeightError.readFailed` when a buffer cannot be created.
    public init(device: MTLDevice, layerIdx: Int, floats: [String: [Float]]) throws {
        let pfx = "vision_tower.encoder.layers.\(layerIdx)."
        let load: (String) throws -> MTLBuffer = { suffix in
            try Self.buffer(device, floats, pfx + suffix)
        }

        inputLayernorm = try load("input_layernorm.weight")
        postAttentionLayernorm = try load("post_attention_layernorm.weight")
        preFeedforwardLayernorm = try load("pre_feedforward_layernorm.weight")
        postFeedforwardLayernorm = try load("post_feedforward_layernorm.weight")

        qNorm = try load("self_attn.q_norm.weight")
        kNorm = try load("self_attn.k_norm.weight")

        // Linear weights use the `.linear.weight` suffix in E2B checkpoints.
        selfAttnQProj = try load("self_attn.q_proj.linear.weight")
        selfAttnKProj = try load("self_attn.k_proj.linear.weight")
        selfAttnVProj = try load("self_attn.v_proj.linear.weight")
        selfAttnOProj = try load("self_attn.o_proj.linear.weight")

        mlpGateProj = try load("mlp.gate_proj.linear.weight")
        mlpUpProj = try load("mlp.up_proj.linear.weight")
        mlpDownProj = try load("mlp.down_proj.linear.weight")
    }
}

/// All Metal buffers needed by the E2B vision tower: patch embedder, embedding
/// projection and the per-layer encoder weights.
public struct VisionWeightsE2B {
    public let inputProjWeight: MTLBuffer
    public let positionEmbedding: MTLBuffer

    public let embeddingProjectionWeight: MTLBuffer
    public let embeddingProjectionScales: MTLBuffer
    public let embeddingProjectionBiases: MTLBuffer

    public let layers: [VisionLayerWeightsE2B]

    /// Copies the tensor stored under `key` into a newly created Metal buffer.
    ///
    /// - Throws: `WeightError.tensorNotFound` when `key` is missing from `floats`;
    ///   `WeightError.readFailed` when the tensor is empty or the device returns no buffer.
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]], _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        // Report a failed or zero-length allocation as an error instead of
        // crashing on a force-unwrap.
        guard byteCount > 0, let metalBuffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw WeightError.readFailed("Failed to create Metal buffer for \(key) (\(byteCount) bytes)")
        }
        return metalBuffer
    }

    /// Builds the weight set from preloaded tensors.
    ///
    /// - Parameters:
    ///   - device: Device that allocates the buffers.
    ///   - config: Supplies `numHiddenLayers`, the number of encoder layers to load.
    ///   - floats: Tensor name → float32 values.
    ///   - tensors: Tensor name → raw bytes; only the embedding-projection weight is read from it.
    /// - Throws: `WeightError.tensorNotFound` for the first missing tensor, or
    ///   `WeightError.readFailed` when a buffer cannot be created.
    public init(device: MTLDevice, config: VisionConfig, floats: [String: [Float]], tensors: [String: Data]) throws {
        let pfx = "vision_tower.patch_embedder."

        inputProjWeight = try Self.buffer(device, floats, pfx + "input_proj.weight")
        positionEmbedding = try Self.buffer(device, floats, pfx + "position_embedding_table")

        // Embedding projection: the weight is uploaded as raw bytes (the loader
        // routes u32 tensors here); scales and biases come from `floats`.
        let ep = "embed_vision.embedding_projection"
        let epWeightKey = ep + ".weight"
        guard let epWeightData = tensors[epWeightKey] else {
            // Report the exact key that was looked up, like every other tensor.
            throw WeightError.tensorNotFound(epWeightKey)
        }
        embeddingProjectionWeight = try epWeightData.withUnsafeBytes { raw -> MTLBuffer in
            // `baseAddress` is nil for empty data, so fail with an error rather than a trap.
            guard let base = raw.baseAddress, raw.count > 0,
                  let metalBuffer = device.makeBuffer(bytes: base, length: raw.count) else {
                throw WeightError.readFailed("Failed to create Metal buffer for \(epWeightKey) (\(raw.count) bytes)")
            }
            return metalBuffer
        }
        embeddingProjectionScales = try Self.buffer(device, floats, ep + ".scales")
        embeddingProjectionBiases = try Self.buffer(device, floats, ep + ".biases")

        layers = try (0..<config.numHiddenLayers).map { layerIdx in
            try VisionLayerWeightsE2B(device: device, layerIdx: layerIdx, floats: floats)
        }
    }
}

/// Encodes the E2B vision tower forward pass onto a Metal command buffer.
///
/// The tower owns scratch buffers sized for up to `maxPatches` patches of
/// 4-byte elements. `forward` encodes the input projection, the position-embedding
/// add, one pass per layer, and the final embedding projection, then waits for
/// the GPU to finish.
public final class VisionTowerE2B {
    /// Failures reported while setting up or running the tower.
    public enum RuntimeError: Error {
        /// The device returned no buffer for an allocation of `length` bytes.
        case bufferAllocationFailed(name: String, length: Int)
        /// `numPatches` was negative or larger than the scratch capacity `max`.
        case invalidPatchCount(Int, max: Int)
        /// The command queue returned no command buffer.
        case commandBufferUnavailable
        /// The command buffer returned no compute encoder for `kernel`.
        case encoderUnavailable(kernel: String)
    }

    public let config: VisionConfig
    public let engine: MarkBaseEngine
    public let weights: VisionWeightsE2B

    /// Number of patches every scratch buffer is sized for.
    private static let maxPatches = 4096

    // Scratch buffers. Only `tempBuffer` and `normBuffer` are written by the
    // current forward pass; the rest are reserved for the layer implementation.
    private let qBuffer: MTLBuffer
    private let kBuffer: MTLBuffer
    private let vBuffer: MTLBuffer
    private let attnOutBuffer: MTLBuffer
    private let mlpBuffer: MTLBuffer
    private let tempBuffer: MTLBuffer
    private let normBuffer: MTLBuffer
    private let residualBuffer: MTLBuffer

    /// Allocates the scratch buffers for `config`.
    ///
    /// - Throws: `RuntimeError.bufferAllocationFailed` when a scratch buffer
    ///   cannot be allocated (including a zero-sized request).
    public init(config: VisionConfig, engine: MarkBaseEngine, weights: VisionWeightsE2B) throws {
        self.config = config
        self.engine = engine
        self.weights = weights

        let device = engine.device
        let bytesPerElement = 4
        let hiddenBytes = config.hiddenSize * VisionTowerE2B.maxPatches * bytesPerElement
        let intermediateBytes = config.intermediateSize * VisionTowerE2B.maxPatches * bytesPerElement

        qBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "qBuffer", length: hiddenBytes)
        kBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "kBuffer", length: hiddenBytes)
        vBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "vBuffer", length: hiddenBytes)
        attnOutBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "attnOutBuffer", length: hiddenBytes)
        mlpBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "mlpBuffer", length: intermediateBytes)
        tempBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "tempBuffer",
                                                          length: max(hiddenBytes, intermediateBytes))
        normBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "normBuffer", length: hiddenBytes)
        residualBuffer = try VisionTowerE2B.makeScratchBuffer(device, name: "residualBuffer", length: hiddenBytes)
    }

    /// Runs the tower over `numPatches` patch embeddings and blocks until the GPU finishes.
    ///
    /// - Parameters:
    ///   - patchEmbeddings: Input buffer bound as the matmul input with
    ///     `inDim == config.hiddenSize`.
    ///   - numPatches: Number of patches to process; `0` is a no-op.
    ///   - outputBuffer: Receives the embedding projection output
    ///     (`config.outputProjDims` values per patch).
    /// - Throws: `RuntimeError.invalidPatchCount` when `numPatches` is negative or
    ///   exceeds the scratch capacity; pipeline lookup errors from the engine;
    ///   the command buffer's own error when GPU execution fails.
    public func forward(patchEmbeddings: MTLBuffer, numPatches: Int, outputBuffer: MTLBuffer) throws {
        // The scratch buffers hold at most `maxPatches` rows; a larger dispatch
        // would write past their end.
        guard numPatches >= 0, numPatches <= Self.maxPatches else {
            throw RuntimeError.invalidPatchCount(numPatches, max: Self.maxPatches)
        }
        guard numPatches > 0 else { return }

        guard let cmdBuf = engine.commandQueue.makeCommandBuffer() else {
            throw RuntimeError.commandBufferUnavailable
        }

        // Input projection: hiddenSize -> hiddenSize per patch, written to tempBuffer.
        var current = try applyFloatMatmul(input: patchEmbeddings, weight: weights.inputProjWeight,
                                           inDim: config.hiddenSize, outDim: config.hiddenSize,
                                           seqLen: numPatches, output: tempBuffer, cmdBuf: cmdBuf)

        // Add position embedding.
        current = try addPositionEmbedding(input: current, numPatches: numPatches, cmdBuf: cmdBuf)

        // One pass per loaded encoder layer.
        for layerWeights in weights.layers {
            current = try applyLayer(input: current, weights: layerWeights, numPatches: numPatches, cmdBuf: cmdBuf)
        }

        // Embedding projection: quantized matmul hiddenSize -> outputProjDims per patch.
        try applyEmbeddingProjection(input: current, numPatches: numPatches, output: outputBuffer, cmdBuf: cmdBuf)

        cmdBuf.commit()
        cmdBuf.waitUntilCompleted()

        // Surface GPU-side failures instead of silently returning stale output.
        if let gpuError = cmdBuf.error {
            throw gpuError
        }
    }

    /// Allocates a scratch buffer of `length` bytes or throws a descriptive error.
    private static func makeScratchBuffer(_ device: MTLDevice, name: String, length: Int) throws -> MTLBuffer {
        guard length > 0, let scratch = device.makeBuffer(length: length) else {
            throw RuntimeError.bufferAllocationFailed(name: name, length: length)
        }
        return scratch
    }

    /// Creates a compute encoder on `cmdBuf`, throwing when Metal returns none.
    private func makeEncoder(_ cmdBuf: MTLCommandBuffer, kernel: String) throws -> MTLComputeCommandEncoder {
        guard let enc = cmdBuf.makeComputeCommandEncoder() else {
            throw RuntimeError.encoderUnavailable(kernel: kernel)
        }
        return enc
    }

    /// Encodes one `quantized_matmul_seq` dispatch over `outDim * seqLen` threads.
    ///
    /// Buffer indices: 0 input, 1 weight, 2 scales, 3 biases, 4 output;
    /// bytes at 5 and 6 carry `inDim` and `outDim` as `UInt32`.
    private func encodeQuantizedMatmul(input: MTLBuffer, weight: MTLBuffer,
                                       scales: MTLBuffer, biases: MTLBuffer,
                                       inDim: Int, outDim: Int, seqLen: Int,
                                       output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws {
        let kernel = "quantized_matmul_seq"
        // Look up the pipeline before opening the encoder so a throw here
        // cannot leave an encoder without `endEncoding()`.
        let pso = try engine.pipeline(named: kernel)
        let enc = try makeEncoder(cmdBuf, kernel: kernel)
        enc.setComputePipelineState(pso)

        enc.setBuffer(input, offset: 0, index: 0)
        enc.setBuffer(weight, offset: 0, index: 1)
        enc.setBuffer(scales, offset: 0, index: 2)
        enc.setBuffer(biases, offset: 0, index: 3)
        enc.setBuffer(output, offset: 0, index: 4)

        var inD = UInt32(inDim)
        enc.setBytes(&inD, length: MemoryLayout<UInt32>.size, index: 5)
        var outD = UInt32(outDim)
        enc.setBytes(&outD, length: MemoryLayout<UInt32>.size, index: 6)

        let grid = MTLSize(width: outDim * seqLen, height: 1, depth: 1)
        let tg = engine.threadgroupSize1D(pso, count: outDim)
        enc.dispatchThreads(grid, threadsPerThreadgroup: tg)
        enc.endEncoding()
    }

    /// Encodes a matmul of `input` against float32 `weight` into `output` and returns `output`.
    ///
    /// NOTE(review): this reuses the `quantized_matmul_seq` kernel with freshly
    /// allocated scale/bias buffers of `outDim` floats. Metal documents
    /// `makeBuffer(length:options:)` as zero-filling, so if the kernel multiplies
    /// by the scales the result would be all zeros, and the kernel may also expect
    /// packed weights rather than float32 — confirm against the kernel source, or
    /// switch to a dedicated float matmul kernel.
    private func applyFloatMatmul(input: MTLBuffer, weight: MTLBuffer,
                                  inDim: Int, outDim: Int, seqLen: Int,
                                  output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        let placeholderBytes = outDim * 4
        let dummyScales = try Self.makeScratchBuffer(engine.device, name: "dummyScales", length: placeholderBytes)
        let dummyBiases = try Self.makeScratchBuffer(engine.device, name: "dummyBiases", length: placeholderBytes)

        try encodeQuantizedMatmul(input: input, weight: weight,
                                  scales: dummyScales, biases: dummyBiases,
                                  inDim: inDim, outDim: outDim, seqLen: seqLen,
                                  output: output, cmdBuf: cmdBuf)
        return output
    }

    /// Encodes `vision_add_pos_embed` over a `hiddenSize x numPatches` grid,
    /// writing to `normBuffer`, and returns that buffer.
    private func addPositionEmbedding(input: MTLBuffer, numPatches: Int, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        let kernel = "vision_add_pos_embed"
        let output = normBuffer
        let pso = try engine.pipeline(named: kernel)
        let enc = try makeEncoder(cmdBuf, kernel: kernel)
        enc.setComputePipelineState(pso)

        enc.setBuffer(input, offset: 0, index: 0)
        enc.setBuffer(weights.positionEmbedding, offset: 0, index: 1)
        enc.setBuffer(output, offset: 0, index: 2)

        var hd = UInt32(config.hiddenSize)
        enc.setBytes(&hd, length: MemoryLayout<UInt32>.size, index: 3)
        var np = UInt32(numPatches)
        enc.setBytes(&np, length: MemoryLayout<UInt32>.size, index: 4)

        let grid = MTLSize(width: config.hiddenSize, height: numPatches, depth: 1)
        let tg = engine.threadgroupSize2D(pso, grid: (config.hiddenSize, numPatches))
        enc.dispatchThreads(grid, threadsPerThreadgroup: tg)
        enc.endEncoding()

        return output
    }

    /// Placeholder for one encoder layer: encodes nothing and returns `input` unchanged.
    ///
    /// The attention and MLP kernels are not implemented yet, so the layer
    /// weights are currently unused by the forward pass.
    private func applyLayer(input: MTLBuffer, weights: VisionLayerWeightsE2B,
                            numPatches: Int, cmdBuf: MTLCommandBuffer) throws -> MTLBuffer {
        return input
    }

    /// Encodes the quantized embedding projection from `config.hiddenSize` to
    /// `config.outputProjDims` values per patch into `output`.
    private func applyEmbeddingProjection(input: MTLBuffer, numPatches: Int,
                                          output: MTLBuffer, cmdBuf: MTLCommandBuffer) throws {
        try encodeQuantizedMatmul(input: input, weight: weights.embeddingProjectionWeight,
                                  scales: weights.embeddingProjectionScales,
                                  biases: weights.embeddingProjectionBiases,
                                  inDim: config.hiddenSize, outDim: config.outputProjDims,
                                  seqLen: numPatches, output: output, cmdBuf: cmdBuf)
    }
}

/// Loads the E2B vision tower, reading all of its tensors in parallel first.
///
/// Every tensor whose name starts with `vision_tower.` or `embed_vision.` is read
/// concurrently. `.bf16` tensors are then converted to float32 and `.u32` tensors
/// are kept as raw bytes; tensors of any other dtype are ignored.
///
/// - Parameters:
///   - reader: Source of tensor descriptors and tensor bytes. It is called from
///     several threads at once, so `read(tensor:)` must tolerate concurrent use.
///   - config: Vision configuration forwarded to the weights and the tower.
///   - engine: Engine whose device allocates all Metal buffers.
/// - Returns: A tower ready for `forward`.
/// - Throws: `WeightError.readFailed` when any tensor read fails, plus whatever
///   `VisionWeightsE2B.init` and `VisionTowerE2B.init` throw.
public func loadVisionTowerE2B(reader: SafeTensorsReader, config: VisionConfig,
                               engine: MarkBaseEngine) throws -> VisionTowerE2B {
    print("Loading E2B Vision Tower with preload optimization...")
    let startTime = Date()

    // Collect all vision tensor descriptors.
    let visionPrefix = "vision_tower."
    let embedPrefix = "embed_vision."
    let visionDescriptors = reader.allDescriptors().filter {
        $0.name.hasPrefix(visionPrefix) || $0.name.hasPrefix(embedPrefix)
    }

    print("  Found \(visionDescriptors.count) vision tensors")

    // Read all tensors in parallel. Each task stores its outcome under a lock:
    // a Swift array must not be mutated from several threads at once, even when
    // the tasks touch different indices.
    let outcomesLock = NSLock()
    var outcomes: [Result<Data, Error>?] = Array(repeating: nil, count: visionDescriptors.count)
    let dispatchGroup = DispatchGroup()
    let loadQueue = DispatchQueue(label: "vision-preload", attributes: .concurrent)

    for (idx, desc) in visionDescriptors.enumerated() {
        loadQueue.async(group: dispatchGroup) {
            // Do the (slow) read outside the lock; only the store is serialized.
            let outcome: Result<Data, Error> = Result { try reader.read(tensor: desc) }
            outcomesLock.lock()
            outcomes[idx] = outcome
            outcomesLock.unlock()
        }
    }

    dispatchGroup.wait()

    // Unwrap the outcomes in descriptor order; the first failure aborts the load.
    var payloads: [Data] = []
    payloads.reserveCapacity(visionDescriptors.count)
    for (idx, outcome) in outcomes.enumerated() {
        switch outcome {
        case .success(let data)?:
            payloads.append(data)
        case .failure(let err)?:
            throw WeightError.readFailed("Failed to preload vision tensor \(visionDescriptors[idx].name): \(err)")
        case nil:
            // Not expected after `wait()`; fail loudly rather than drop the tensor.
            throw WeightError.readFailed("Failed to preload vision tensor \(visionDescriptors[idx].name): no result recorded")
        }
    }

    let preloadTime = Date().timeIntervalSince(startTime) * 1000
    print("  ✓ Parallel preloaded \(visionDescriptors.count) vision tensors in \(String(format: "%.1f", preloadTime))ms")

    // Sort the preloaded bytes into the two dictionaries the weight structs expect.
    var floats: [String: [Float]] = [:]
    var tensors: [String: Data] = [:]

    for (desc, data) in zip(visionDescriptors, payloads) {
        if desc.dtype == .bf16 {
            floats[desc.name] = SafeTensorsReader.bf16ToFloat32(data)
        } else if desc.dtype == .u32 {
            tensors[desc.name] = data
        }
        // Other dtypes are skipped; a required tensor stored in another dtype
        // surfaces later as `tensorNotFound`.
    }

    let weights = try VisionWeightsE2B(device: engine.device, config: config,
                                       floats: floats, tensors: tensors)

    let totalTime = Date().timeIntervalSince(startTime) * 1000
    print("  ✓ E2B Vision Tower loaded in \(String(format: "%.1f", totalTime))ms")

    return try VisionTowerE2B(config: config, engine: engine, weights: weights)
}