// markbaseengine/Sources/MarkBase/Audio/AudioWeights.swift

import Metal
import Foundation

/// Metal buffers holding every weight read under the `audio_tower.` prefix:
/// two subsample-conv stages, the input projection, the quantized output
/// projection with its bias, and one `AudioLayerWeights` per hidden layer.
public final class AudioWeights {
    /// Loaded from `audio_tower.subsample_conv_projection.layer0.*`.
    public let subsampleConvLayer0: SubsampleConvLayer
    /// Loaded from `audio_tower.subsample_conv_projection.layer1.*`.
    public let subsampleConvLayer1: SubsampleConvLayer
    public let inputProjLinearWeight: MTLBuffer  // Float32, not quantized

    /// Quantized `audio_tower.output_proj` (weight / scales / biases).
    public let outputProj: QuantizedWeights
    /// Float buffer loaded from `audio_tower.output_proj.bias`.
    public let outputProjBias: MTLBuffer

    /// One entry per layer index in `0..<config.numHiddenLayers`, in order.
    public let layers: [AudioLayerWeights]

    /// Builds all audio-tower buffers on `device`.
    ///
    /// - Parameters:
    ///   - device: Metal device used to allocate every buffer.
    ///   - config: Only `numHiddenLayers` is read here.
    ///   - tensors: Raw bytes keyed by tensor name (used for quantized `.weight` tensors).
    ///   - floats: Float arrays keyed by tensor name (norms, conv weights, scales, biases).
    ///   - descriptors: Tensor descriptors keyed by tensor name; their `shape` gives the
    ///     quantized dimensions.
    /// - Throws: `WeightError.tensorNotFound` when a required entry is missing,
    ///   `WeightError.bufferCreationFailed` when Metal cannot allocate a buffer, and
    ///   anything thrown by `AudioLayerWeights.init`.
    public init(device: MTLDevice, config: AudioConfig,
                tensors: [String: Data], floats: [String: [Float]],
                descriptors: [String: TensorDescriptor]) throws {
        let prefix = "audio_tower."
        let subsample = prefix + "subsample_conv_projection."

        subsampleConvLayer0 = SubsampleConvLayer(
            convWeight: try Self.buffer(device, floats, subsample + "layer0.conv.weight"),
            normWeight: try Self.buffer(device, floats, subsample + "layer0.norm.weight")
        )

        subsampleConvLayer1 = SubsampleConvLayer(
            convWeight: try Self.buffer(device, floats, subsample + "layer1.conv.weight"),
            normWeight: try Self.buffer(device, floats, subsample + "layer1.norm.weight")
        )

        inputProjLinearWeight = try Self.buffer(device, floats, subsample + "input_proj_linear.weight")

        outputProj = try Self.loadQuantized(device: device, tensors: tensors, floats: floats,
                                            descriptors: descriptors,
                                            name: prefix + "output_proj")
        outputProjBias = try Self.buffer(device, floats, prefix + "output_proj.bias")

        // Layers are loaded in index order; the first failing layer aborts the whole load.
        layers = try (0..<config.numHiddenLayers).map { layerIdx in
            try AudioLayerWeights(device: device, layerIdx: layerIdx,
                                  tensors: tensors, floats: floats,
                                  descriptors: descriptors)
        }
    }

    // ── Helpers ──

    /// Copies the float tensor stored under `key` into a new Metal buffer.
    ///
    /// - Throws: `WeightError.tensorNotFound(key)` when `floats` has no such entry;
    ///   `WeightError.bufferCreationFailed(key)` when the device returns no buffer.
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]],
                                _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        guard let buf = device.makeBuffer(bytes: values, length: byteCount) else {
            // The tensor exists; it is the allocation that failed, so report it as
            // such (same case `loadQuantized` uses) rather than as a missing tensor.
            throw WeightError.bufferCreationFailed(key)
        }
        return buf
    }

    /// Loads the quantized tensor triple `name.weight` / `name.scales` / `name.biases`.
    ///
    /// The packed weight bytes come from `tensors`; scales and biases come from
    /// `floats`. Dimensions are derived from the descriptors of the weight and
    /// scales tensors. The result is always built with `bits: 4` and `groupSize: 64`.
    ///
    /// - Throws: `WeightError.tensorNotFound(name)` when any of the three tensors or
    ///   the two descriptors is missing, or when a descriptor's shape has too few
    ///   dimensions to read from; `WeightError.bufferCreationFailed` when a Metal
    ///   buffer cannot be created.
    static func loadQuantized(device: MTLDevice, tensors: [String: Data],
                               floats: [String: [Float]],
                               descriptors: [String: TensorDescriptor],
                               name: String) throws -> QuantizedWeights {
        let wName = name + ".weight"
        let sName = name + ".scales"
        let bName = name + ".biases"

        guard let wData = tensors[wName],
              let sFloats = floats[sName],
              let bFloats = floats[bName],
              let wDesc = descriptors[wName],
              let sDesc = descriptors[sName] else {
            throw WeightError.tensorNotFound(name)
        }

        // A descriptor whose shape lacks the dimensions read below is unusable;
        // throw instead of trapping on an out-of-range subscript.
        guard wDesc.shape.count >= 1, sDesc.shape.count >= 2 else {
            throw WeightError.tensorNotFound(name)
        }

        // Dimensions from descriptors:
        // weight: [outDim, inDim/8] (U32 packed, 8 values per U32)
        // scales: [outDim, numGroups] where numGroups = inDim / groupSize
        let outDim = wDesc.shape[0]
        let numGroups = sDesc.shape[1]
        let groupSize = 64  // Audio uses fixed group_size=64
        let inDim = numGroups * groupSize

        // Copy the packed weights while the Data storage is guaranteed alive;
        // a pointer taken from a bridged NSData temporary has no such guarantee.
        let packedWeights = wData.withUnsafeBytes { (raw: UnsafeRawBufferPointer) -> MTLBuffer? in
            guard let base = raw.baseAddress else { return nil }
            return device.makeBuffer(bytes: base, length: raw.count,
                                     options: .storageModeShared)
        }
        guard let wBuf = packedWeights else {
            throw WeightError.bufferCreationFailed(wName)
        }
        guard let sBuf = device.makeBuffer(bytes: sFloats,
                                            length: sFloats.count * MemoryLayout<Float>.stride,
                                            options: .storageModeShared) else {
            throw WeightError.bufferCreationFailed(sName)
        }
        guard let bBuf = device.makeBuffer(bytes: bFloats,
                                            length: bFloats.count * MemoryLayout<Float>.stride,
                                            options: .storageModeShared) else {
            throw WeightError.bufferCreationFailed(bName)
        }

        return QuantizedWeights(weight: wBuf, scales: sBuf, biases: bBuf,
                                inDim: inDim, outDim: outDim, bits: 4, groupSize: groupSize)
    }
}

/// Float32 buffers for one subsample convolution stage.
///
/// `AudioWeights` builds these from the
/// `audio_tower.subsample_conv_projection.layer0` / `layer1` tensors.
public struct SubsampleConvLayer {
    /// Buffer created from the stage's `conv.weight` float tensor.
    public let convWeight: MTLBuffer
    /// Buffer created from the stage's `norm.weight` float tensor.
    public let normWeight: MTLBuffer
}

/// Metal buffers for a single audio-tower layer, read from tensors named
/// `audio_tower.layers.<layerIdx>.*`.
///
/// Plain `MTLBuffer` members are copied from float tensors; `QuantizedWeights`
/// members are loaded through `AudioWeights.loadQuantized`.
public struct AudioLayerWeights {
    public let normPreAttn: MTLBuffer
    public let normPostAttn: MTLBuffer
    public let normOut: MTLBuffer

    public let selfAttnQProj: QuantizedWeights
    public let selfAttnKProj: QuantizedWeights
    public let selfAttnVProj: QuantizedWeights
    public let selfAttnPost: QuantizedWeights
    public let selfAttnRelativeKProj: MTLBuffer
    public let selfAttnPerDimScale: MTLBuffer

    public let lconv1dPreLayerNorm: MTLBuffer
    public let lconv1dConvNorm: MTLBuffer
    public let lconv1dDepthwiseConv: MTLBuffer
    public let lconv1dLinearStart: QuantizedWeights
    public let lconv1dLinearEnd: QuantizedWeights

    public let feedForward1: FeedForwardWeights
    public let feedForward2: FeedForwardWeights

    /// Copies the float tensor stored under `key` into a new Metal buffer.
    ///
    /// - Throws: `WeightError.tensorNotFound(key)` when `floats` has no such entry;
    ///   `WeightError.bufferCreationFailed(key)` when the device returns no buffer.
    private static func buffer(_ device: MTLDevice, _ floats: [String: [Float]],
                                _ key: String) throws -> MTLBuffer {
        guard let values = floats[key] else {
            throw WeightError.tensorNotFound(key)
        }
        let byteCount = values.count * MemoryLayout<Float>.stride
        guard let buf = device.makeBuffer(bytes: values, length: byteCount) else {
            // The tensor was found; the allocation failed. Use the dedicated case
            // instead of stuffing a message into `tensorNotFound`.
            throw WeightError.bufferCreationFailed(key)
        }
        return buf
    }

    /// Loads every tensor of layer `layerIdx`.
    ///
    /// - Parameters:
    ///   - device: Metal device used to allocate every buffer.
    ///   - layerIdx: Index interpolated into the `audio_tower.layers.<layerIdx>.` prefix.
    ///   - tensors: Raw bytes keyed by tensor name (quantized `.weight` tensors).
    ///   - floats: Float arrays keyed by tensor name.
    ///   - descriptors: Tensor descriptors keyed by tensor name.
    /// - Throws: `WeightError.tensorNotFound` or `WeightError.bufferCreationFailed`
    ///   from the first tensor that cannot be loaded, plus anything thrown by
    ///   `AudioWeights.loadQuantized` or `FeedForwardWeights.init`.
    public init(device: MTLDevice, layerIdx: Int,
                tensors: [String: Data], floats: [String: [Float]],
                descriptors: [String: TensorDescriptor]) throws {
        let prefix = "audio_tower.layers.\(layerIdx)."

        // Local shorthands so each assignment below only names the tensor suffix.
        func floatBuffer(_ suffix: String) throws -> MTLBuffer {
            return try AudioLayerWeights.buffer(device, floats, prefix + suffix)
        }
        func quantized(_ suffix: String) throws -> QuantizedWeights {
            return try AudioWeights.loadQuantized(device: device, tensors: tensors,
                                                  floats: floats, descriptors: descriptors,
                                                  name: prefix + suffix)
        }
        func feedForward(_ suffix: String) throws -> FeedForwardWeights {
            return try FeedForwardWeights(device: device, prefix: prefix + suffix,
                                          tensors: tensors, floats: floats,
                                          descriptors: descriptors)
        }

        normPreAttn = try floatBuffer("norm_pre_attn.weight")
        normPostAttn = try floatBuffer("norm_post_attn.weight")
        normOut = try floatBuffer("norm_out.weight")

        selfAttnQProj = try quantized("self_attn.q_proj")
        selfAttnKProj = try quantized("self_attn.k_proj")
        selfAttnVProj = try quantized("self_attn.v_proj")
        selfAttnPost = try quantized("self_attn.post")

        selfAttnRelativeKProj = try floatBuffer("self_attn.relative_k_proj.weight")
        selfAttnPerDimScale = try floatBuffer("self_attn.per_dim_scale")

        lconv1dPreLayerNorm = try floatBuffer("lconv1d.pre_layer_norm.weight")
        lconv1dConvNorm = try floatBuffer("lconv1d.conv_norm.weight")
        lconv1dDepthwiseConv = try floatBuffer("lconv1d.depthwise_conv1d.weight")

        lconv1dLinearStart = try quantized("lconv1d.linear_start")
        lconv1dLinearEnd = try quantized("lconv1d.linear_end")

        feedForward1 = try feedForward("feed_forward1")
        feedForward2 = try feedForward("feed_forward2")
    }
}

/// Metal buffers for one feed-forward block: two layer-norm weight buffers and
/// two quantized linear layers, all read from tensors named `<prefix>.*`.
public struct FeedForwardWeights {
    /// Float buffer loaded from `<prefix>.pre_layer_norm.weight`.
    public let preLayerNorm: MTLBuffer
    /// Float buffer loaded from `<prefix>.post_layer_norm.weight`.
    public let postLayerNorm: MTLBuffer
    /// Quantized weights loaded from `<prefix>.ffw_layer_1`.
    public let ffwLayer1: QuantizedWeights
    /// Quantized weights loaded from `<prefix>.ffw_layer_2`.
    public let ffwLayer2: QuantizedWeights

    /// Loads the feed-forward tensors found under `prefix`.
    ///
    /// - Parameters:
    ///   - device: Metal device used to allocate every buffer.
    ///   - prefix: Tensor-name prefix without a trailing dot.
    ///   - tensors: Raw bytes keyed by tensor name (quantized `.weight` tensors).
    ///   - floats: Float arrays keyed by tensor name.
    ///   - descriptors: Tensor descriptors keyed by tensor name.
    /// - Throws: `WeightError.tensorNotFound` when a float tensor is missing,
    ///   `WeightError.bufferCreationFailed` when the device returns no buffer, and
    ///   anything thrown by `AudioWeights.loadQuantized`.
    public init(device: MTLDevice, prefix: String,
                tensors: [String: Data], floats: [String: [Float]],
                descriptors: [String: TensorDescriptor]) throws {
        // Copies one float tensor into a new Metal buffer.
        func floatBuffer(_ key: String) throws -> MTLBuffer {
            guard let values = floats[key] else {
                throw WeightError.tensorNotFound(key)
            }
            let byteCount = values.count * MemoryLayout<Float>.stride
            guard let buf = device.makeBuffer(bytes: values, length: byteCount) else {
                // Allocation failure is not a missing tensor; report it with the
                // dedicated case that `AudioWeights.loadQuantized` also uses.
                throw WeightError.bufferCreationFailed(key)
            }
            return buf
        }

        // Loads the weight/scales/biases triple stored under `name`.
        func quantized(_ name: String) throws -> QuantizedWeights {
            return try AudioWeights.loadQuantized(device: device, tensors: tensors,
                                                  floats: floats, descriptors: descriptors,
                                                  name: name)
        }

        preLayerNorm = try floatBuffer(prefix + ".pre_layer_norm.weight")
        postLayerNorm = try floatBuffer(prefix + ".post_layer_norm.weight")

        ffwLayer1 = try quantized(prefix + ".ffw_layer_1")
        ffwLayer2 = try quantized(prefix + ".ffw_layer_2")
    }
}