// markbaseengine/Sources/MarkBase/Vision/VisionWeights.swift

import Metal

/// Vision-tower weights copied into Metal buffers: the patch embedder's input
/// projection and position-embedding table, the `embed_vision` embedding
/// projection, and one `VisionLayerWeights` per encoder layer.
public final class VisionWeights {
    /// Failures raised while turning already-decoded tensors into Metal buffers.
    public enum LoadError: Error {
        /// `MTLDevice.makeBuffer` produced no buffer for the named tensor
        /// (the tensor was empty or the allocation failed).
        case bufferAllocationFailed(String)
        /// The scales tensor cannot describe the requested dimensions
        /// (deriving the group size would divide by zero).
        case invalidQuantizationShape(String)
    }

    /// Quantized `vision_tower.patch_embedder.input_proj` weights.
    public let inputProj: QuantizedWeights
    /// Float values of `vision_tower.patch_embedder.position_embedding_table`.
    public let positionEmbedding: MTLBuffer

    /// Raw bytes of `embed_vision.embedding_projection.weight`.
    public let embeddingProjectionWeight: MTLBuffer  // uint32 packed
    /// Float values of `embed_vision.embedding_projection.scales`.
    public let embeddingProjectionScales: MTLBuffer
    /// Float values of `embed_vision.embedding_projection.biases`.
    public let embeddingProjectionBiases: MTLBuffer

    /// Encoder layers in index order, `0..<config.numHiddenLayers`.
    public let layers: [VisionLayerWeights]

    /// Looks up every required tensor by name and copies it to `device`.
    ///
    /// - Parameters:
    ///   - device: Metal device that allocates the buffers.
    ///   - config: Supplies `hiddenSize` and `numHiddenLayers` here; also forwarded
    ///     to each `VisionLayerWeights`.
    ///   - tensors: Raw tensor bytes keyed by tensor name (used for packed weights).
    ///   - floats: Float tensors keyed by tensor name (scales, biases, tables).
    /// - Throws: `WeightError.tensorNotFound` when a key is missing;
    ///   `LoadError` when a buffer cannot be created or a quantized shape is
    ///   unusable; any error thrown by `VisionLayerWeights.init`.
    public init(device: MTLDevice, config: VisionConfig,
                tensors: [String: Data], floats: [String: [Float]]) throws {
        let pfx = "vision_tower.patch_embedder."

        inputProj = try Self.loadQuantized(name: pfx + "input_proj",
                                            tensors: tensors, floats: floats,
                                            device: device,
                                            inDim: config.hiddenSize,
                                            outDim: config.hiddenSize)

        let peKey = pfx + "position_embedding_table"
        guard let pe = floats[peKey] else {
            throw WeightError.tensorNotFound("position_embedding_table")
        }
        positionEmbedding = try Self.makeFloatBuffer(device: device, values: pe, label: peKey)

        // Embedding projection — already quantized; kept as three raw buffers.
        let ep = "embed_vision.embedding_projection"
        guard let epWeight = tensors[ep + ".weight"] else {
            throw WeightError.tensorNotFound("embedding_projection.weight")
        }
        embeddingProjectionWeight = try Self.makeByteBuffer(
            device: device, data: epWeight, label: ep + ".weight")

        guard let epScales = floats[ep + ".scales"] else {
            throw WeightError.tensorNotFound("embedding_projection.scales")
        }
        embeddingProjectionScales = try Self.makeFloatBuffer(
            device: device, values: epScales, label: ep + ".scales")

        guard let epBiases = floats[ep + ".biases"] else {
            throw WeightError.tensorNotFound("embedding_projection.biases")
        }
        embeddingProjectionBiases = try Self.makeFloatBuffer(
            device: device, values: epBiases, label: ep + ".biases")

        // Layers load in ascending index order; the first failure aborts the load.
        layers = try (0..<config.numHiddenLayers).map { index in
            try VisionLayerWeights(device: device, config: config, layerIdx: index,
                                   tensors: tensors, floats: floats)
        }
    }

    /// Loads the `.weight`, `.scales` and `.biases` tensors stored under `name`
    /// and wraps them in a `QuantizedWeights` with `bits` fixed at 4.
    ///
    /// The group size is derived as `inDim / (scales.count / outDim)`.
    ///
    /// - Parameters:
    ///   - name: Tensor name prefix, without the `.weight`/`.scales`/`.biases` suffix.
    ///   - tensors: Raw tensor bytes keyed by tensor name.
    ///   - floats: Float tensors keyed by tensor name.
    ///   - device: Metal device that allocates the buffers.
    ///   - inDim: Input dimension recorded on the result and used to derive the group size.
    ///   - outDim: Output dimension recorded on the result; must be positive.
    /// - Returns: Quantized weights backed by three newly created buffers.
    /// - Throws: `WeightError.tensorNotFound` when a key is missing;
    ///   `LoadError.invalidQuantizationShape` when the group size cannot be derived;
    ///   `LoadError.bufferAllocationFailed` when a buffer cannot be created.
    public static func loadQuantized(name: String,
                                       tensors: [String: Data],
                                       floats: [String: [Float]],
                                       device: MTLDevice,
                                       inDim: Int, outDim: Int) throws -> QuantizedWeights {
        let wKey = name + ".weight"
        let sKey = name + ".scales"
        let bKey = name + ".biases"
        guard let wData = tensors[wKey] else {
            throw WeightError.tensorNotFound("Quantized weight \(wKey)")
        }
        guard let sData = floats[sKey] else {
            throw WeightError.tensorNotFound("Quantized scales \(sKey)")
        }
        guard let bData = floats[bKey] else {
            throw WeightError.tensorNotFound("Quantized biases \(bKey)")
        }

        // Scales shape is [outDim, numGroups], so numGroups = sData.count / outDim.
        // Both divisions below would trap on a zero divisor, so reject those
        // inputs with an error before allocating anything.
        guard outDim > 0 else {
            throw LoadError.invalidQuantizationShape(
                "\(name): outDim must be positive, got \(outDim)")
        }
        let numGroups = sData.count / outDim
        guard numGroups > 0 else {
            throw LoadError.invalidQuantizationShape(
                "\(sKey): \(sData.count) scales cannot cover outDim \(outDim)")
        }
        let groupSize = inDim / numGroups

        let weight = try makeByteBuffer(device: device, data: wData, label: wKey)
        let scales = try makeFloatBuffer(device: device, values: sData, label: sKey)
        let biases = try makeFloatBuffer(device: device, values: bData, label: bKey)

        return QuantizedWeights(weight: weight, scales: scales, biases: biases,
                                inDim: inDim, outDim: outDim, bits: 4, groupSize: groupSize)
    }

    /// Copies `values` into a new buffer, throwing instead of crashing when the
    /// array is empty or the device returns no buffer.
    private static func makeFloatBuffer(device: MTLDevice, values: [Float],
                                        label: String) throws -> MTLBuffer {
        let byteCount = values.count * MemoryLayout<Float>.stride
        guard byteCount > 0,
              let buffer = device.makeBuffer(bytes: values, length: byteCount) else {
            throw LoadError.bufferAllocationFailed(label)
        }
        return buffer
    }

    /// Copies the bytes of `data` into a new buffer, throwing instead of crashing
    /// when the data is empty or the device returns no buffer.
    private static func makeByteBuffer(device: MTLDevice, data: Data,
                                       label: String) throws -> MTLBuffer {
        let allocated: MTLBuffer? = data.withUnsafeBytes { raw -> MTLBuffer? in
            // `baseAddress` is nil for empty data; treat that as a failed allocation.
            guard let base = raw.baseAddress, raw.count > 0 else { return nil }
            return device.makeBuffer(bytes: base, length: raw.count)
        }
        guard let buffer = allocated else {
            throw LoadError.bufferAllocationFailed(label)
        }
        return buffer
    }
}

/// Weights of one vision encoder layer, read from tensors named
/// `vision_tower.encoder.layers.<layerIdx>.*` and copied into Metal buffers.
public struct VisionLayerWeights {
    /// Failures raised while copying a layer tensor into a Metal buffer.
    public enum LoadError: Error {
        /// `MTLDevice.makeBuffer` produced no buffer for the named tensor
        /// (the tensor was empty or the allocation failed).
        case bufferAllocationFailed(String)
    }

    // Float norm weights, one buffer per `*_layernorm.weight` tensor.
    public let inputLayernorm: MTLBuffer
    public let postAttentionLayernorm: MTLBuffer
    public let preFeedforwardLayernorm: MTLBuffer
    public let postFeedforwardLayernorm: MTLBuffer

    // Self-attention projections plus the q/k norm weights.
    public let selfAttnQProj: QuantizedWeights
    public let selfAttnKProj: QuantizedWeights
    public let selfAttnVProj: QuantizedWeights
    public let selfAttnOProj: QuantizedWeights
    public let qNorm: MTLBuffer
    public let kNorm: MTLBuffer

    // MLP projections.
    public let mlpGateProj: QuantizedWeights
    public let mlpUpProj: QuantizedWeights
    public let mlpDownProj: QuantizedWeights

    /// Loads every tensor belonging to layer `layerIdx`.
    ///
    /// - Parameters:
    ///   - device: Metal device that allocates the buffers.
    ///   - config: Supplies `hiddenSize` and `intermediateSize` for the projection dimensions.
    ///   - layerIdx: Layer index interpolated into each tensor name.
    ///   - tensors: Raw tensor bytes keyed by tensor name.
    ///   - floats: Float tensors keyed by tensor name.
    /// - Throws: `WeightError.tensorNotFound` when a norm tensor is missing;
    ///   `LoadError.bufferAllocationFailed` when a norm buffer cannot be created;
    ///   any error thrown by `VisionWeights.loadQuantized`.
    public init(device: MTLDevice, config: VisionConfig, layerIdx: Int,
                tensors: [String: Data], floats: [String: [Float]]) throws {
        let prefix = "vision_tower.encoder.layers.\(layerIdx)"
        let hidden = config.hiddenSize
        let intermediate = config.intermediateSize

        // Copies one float norm tensor to the device, throwing rather than
        // force-unwrapping when the tensor is empty or allocation fails.
        func loadNorm(_ suffix: String) throws -> MTLBuffer {
            let key = prefix + suffix
            guard let values = floats[key] else {
                throw WeightError.tensorNotFound("Norm \(key)")
            }
            let byteCount = values.count * MemoryLayout<Float>.stride
            guard byteCount > 0,
                  let buffer = device.makeBuffer(bytes: values, length: byteCount) else {
                throw LoadError.bufferAllocationFailed(key)
            }
            return buffer
        }

        // Loads one quantized projection stored under `prefix + suffix`.
        func loadProjection(_ suffix: String, inDim: Int, outDim: Int) throws -> QuantizedWeights {
            try VisionWeights.loadQuantized(name: prefix + suffix,
                                            tensors: tensors, floats: floats,
                                            device: device,
                                            inDim: inDim, outDim: outDim)
        }

        inputLayernorm = try loadNorm(".input_layernorm.weight")
        postAttentionLayernorm = try loadNorm(".post_attention_layernorm.weight")
        preFeedforwardLayernorm = try loadNorm(".pre_feedforward_layernorm.weight")
        postFeedforwardLayernorm = try loadNorm(".post_feedforward_layernorm.weight")

        qNorm = try loadNorm(".self_attn.q_norm.weight")
        kNorm = try loadNorm(".self_attn.k_norm.weight")

        selfAttnQProj = try loadProjection(".self_attn.q_proj", inDim: hidden, outDim: hidden)
        selfAttnKProj = try loadProjection(".self_attn.k_proj", inDim: hidden, outDim: hidden)
        selfAttnVProj = try loadProjection(".self_attn.v_proj", inDim: hidden, outDim: hidden)
        selfAttnOProj = try loadProjection(".self_attn.o_proj", inDim: hidden, outDim: hidden)

        mlpGateProj = try loadProjection(".mlp.gate_proj", inDim: hidden, outDim: intermediate)
        mlpUpProj = try loadProjection(".mlp.up_proj", inDim: hidden, outDim: intermediate)
        mlpDownProj = try loadProjection(".mlp.down_proj", inDim: intermediate, outDim: hidden)
    }
}