diff --git a/Sources/MarkBase/Layers/Layer.swift b/Sources/MarkBase/Layers/Layer.swift
index 3e78c02..10e4387 100644
--- a/Sources/MarkBase/Layers/Layer.swift
+++ b/Sources/MarkBase/Layers/Layer.swift
@@ -366,9 +366,8 @@ func quantizedMatmul(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
                                    weights: QuantizedWeights,
                                    output: MTLBuffer) throws {
         // Select kernel based on quantization bits
-        let kernelName = weights.bits == 8 ? "quantized_matmul_8bit" : "quantized_matmul"
-        // TEMPORARILY USE FALLBACK KERNEL FOR TESTING
-        if false, let pso = try? engine.pipeline(named: kernelName) {
+        let kernelName = weights.bits == 8 ? "quantized_matmul_simd_8bit" : "quantized_matmul"
+        if let pso = try? engine.pipeline(named: kernelName) {
             let enc = cmdBuf.makeComputeCommandEncoder()!
             enc.setComputePipelineState(pso)
             enc.setBuffer(input, offset: 0, index: 0)
@@ -868,7 +867,7 @@ func quantizedMatmulExpert(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
             enc.setBytes(&inDim, length: MemoryLayout<UInt32>.size, index: 5)
             var outDim = UInt32(expert.expertOutDim)
             enc.setBytes(&outDim, length: MemoryLayout<UInt32>.size, index: 6)
-            var groupSize = UInt32(expert.expertInDim / 64)
+            var groupSize = UInt32(expert.expertInDim / expert.numGroups)
             enc.setBytes(&groupSize, length: MemoryLayout<UInt32>.size, index: 7)
             let tg = engine.threadgroupSize1D(fallbackPSO, count: expert.expertOutDim)
             enc.dispatchThreads(MTLSize(width: expert.expertOutDim, height: 1, depth: 1),
@@ -922,7 +921,7 @@ func quantizedMatmulExpert(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
         enc.setBytes(&inDim, length: MemoryLayout<UInt32>.size, index: 8)
         var outDim = UInt32(gate.expertOutDim)
         enc.setBytes(&outDim, length: MemoryLayout<UInt32>.size, index: 9)
-        var groupSize = UInt32(gate.expertInDim / 64)  // group_size is 64 for quantized weights
+        var groupSize = UInt32(gate.expertInDim / gate.numGroups)
         enc.setBytes(&groupSize, length: MemoryLayout<UInt32>.size, index: 10)
         let count = gate.expertOutDim
         let tg = engine.threadgroupSize1D(pso, count: count)
@@ -977,6 +976,10 @@ func quantizedMatmulExpert(engine: MarkBaseEngine, cmdBuf: MTLCommandBuffer,
                         gate: MoEExpertGroup, up: MoEExpertGroup, down: MoEExpertGroup,
                         accum: MTLBuffer) throws -> Bool {
         guard let pso = try? engine.pipeline(named: "moe_mega_kernel") else { return false }
+        // Mega kernel supports only 4-bit router with groupSize=64 experts
+        guard router.bits == 4 else { return false }
+        let expertGroupSize = gate.expertInDim / gate.numGroups
+        guard expertGroupSize == 64 else { return false }
         let enc = cmdBuf.makeComputeCommandEncoder()!
         enc.setComputePipelineState(pso)
         enc.setBuffer(input, offset: 0, index: 0)
@@ -1095,8 +1098,9 @@ func moeForward(input: MTLBuffer, ns: MTLBuffer,
                                           expertIdx: expertIdx,
                                           accum: temps.h, weight: weight)
             }
+            
         }
-
+ 
         // ── Step 5: Residual: input += moe_output (temps.h) scaled by layerScalar ──
         if layerScalar != 1.0 {
             try eltwiseAddScaled(engine: engine, cmdBuf: cmdBuf,
diff --git a/Sources/MarkBase/Metal/OptimizedKernels.metal b/Sources/MarkBase/Metal/OptimizedKernels.metal
index 6ccf559..d00476a 100644
--- a/Sources/MarkBase/Metal/OptimizedKernels.metal
+++ b/Sources/MarkBase/Metal/OptimizedKernels.metal
@@ -343,8 +343,8 @@ kernel void quantized_matmul_simd(
         uint packedBase = outRow * (inDim / 8) + g * (groupSize / 8);
         uint xBase = g * groupSize;
 
-        // Process 4 uint32 per iteration (32 nibbles) — half the loop count
-        for (uint p = 0; p < 8; p += 4) {
+            // Process 4 uint32 per iteration (32 nibbles) — half the loop count
+            for (uint p = 0; p < groupSize / 8; p += 4) {
             // Vectorized uint4 load (reduces load instructions)
             device uint4 *packedPtr = (device uint4*)(&w[packedBase + p]);
             uint4 packed = *packedPtr;
@@ -510,7 +510,7 @@ kernel void quantized_matmul_gate_up_down(
             uint wBase = gid * packedPerIn + g * (groupSize / 8);
             uint xBase = g * groupSize;
 
-            for (uint p = 0; p < 8; p += 4) {
+            for (uint p = 0; p < groupSize / 8; p += 4) {
                 device uint4 *gPtr = (device uint4*)(&w_gate[wBase + p]);
                 device uint4 *uPtr = (device uint4*)(&w_up[wBase + p]);
                 uint4 gP = *gPtr;
@@ -588,7 +588,7 @@ kernel void quantized_matmul_gate_up_down(
             uint wBase = gid * packedPerOut + g * (groupSize / 8);
             uint iBase = g * groupSize;
 
-            for (uint p = 0; p < 8; p += 4) {
+            for (uint p = 0; p < groupSize / 8; p += 4) {
                 device uint4 *wPtr = (device uint4*)(&w_down[wBase + p]);
                 uint4 packed = *wPtr;
 
@@ -1123,7 +1123,7 @@ kernel void quantized_matmul_gate_up_opt(
         uint wBase = gid * packedPerOut + g * (groupSize / 8);
         uint xBase = g * groupSize;
 
-        for (uint p = 0; p < 8; p += 4) {
+        for (uint p = 0; p < groupSize / 8; p += 4) {
             device uint4 *gPtr = (device uint4*)(&w_gate[wBase + p]);
             device uint4 *uPtr = (device uint4*)(&w_up[wBase + p]);
             uint4 gP = *gPtr;
diff --git a/Sources/MarkBase/Model.swift b/Sources/MarkBase/Model.swift
index 01e5778..23f6af3 100644
--- a/Sources/MarkBase/Model.swift
+++ b/Sources/MarkBase/Model.swift
@@ -291,30 +291,7 @@ readers = readersDict
         // Handle optional missing scales/biases (non-quantized embedding)
         if let eg = embedGroup {
             print("  ✓ embed_tokens loaded")
-            // Check if scales need normalization for custom quantization
-            // For groupSize=32 models, scales are ~3000x larger than standard
-            // Need to divide by hiddenSize to get correct values
-            if eg.groupSize == 32 && eg.inDim == hiddenSize {
-                print("  ⚠ Detected groupSize=32 custom quantization, normalizing scales...")
-                let scaleCorrection = Float(hiddenSize)
-                let pso = try engine.pipeline(named: "eltwise_scale")
-                let cmdBuf = engine.commandQueue.makeCommandBuffer()!
-                let enc = cmdBuf.makeComputeCommandEncoder()!
-                enc.setComputePipelineState(pso)
-                enc.setBuffer(eg.scales, offset: 0, index: 0)
-                var s = 1.0 / scaleCorrection
-                enc.setBytes(&s, length: MemoryLayout<Float>.size, index: 1)
-                let count = eg.scales.length / MemoryLayout<Float>.stride
-                var N = UInt32(count)
-                enc.setBytes(&N, length: MemoryLayout<UInt32>.size, index: 2)
-                let tg = engine.threadgroupSize1D(pso, count: count)
-                enc.dispatchThreads(MTLSize(width: count, height: 1, depth: 1),
-                                   threadsPerThreadgroup: tg)
-                enc.endEncoding()
-                cmdBuf.commit()
-                cmdBuf.waitUntilCompleted()
-                print("  ✓ Scales normalized (divided by \(scaleCorrection))")
-            }
+            // Note: groupSize=32 scale normalization now done in quantizedGroup
             self.embedWeight = eg
         } else {
             // Non-quantized: create dummy quantized wrapper (all 0 scales=1.0, biases=0.0)
@@ -547,19 +524,31 @@ readers = readersDict
                 let sName = "\(fullName).scales"
                 let bName = "\(fullName).biases"
                 
-                if let wData = preloadedDataCache[wName], let sData = preloadedDataCache[sName] {
-                    let bData = preloadedDataCache[bName]
+                if let wData = preloadedDataCache[wName], let sData = preloadedDataCache[sName], fullName.contains("embed") == false {
                     let wDesc = allTensors.first(where: { $0.name == wName })
                     let sDesc = allTensors.first(where: { $0.name == sName })
                     
+                    let wShape = wDesc?.shape ?? []
+                    let sShape = sDesc?.shape ?? []
+                    let outDim = wShape.count > 0 ? wShape[0] : 0
+                    let packedDim = wShape.count > 1 ? wShape[1] : 0
+                    let inDim = packedDim * (bits == 4 ? 8 : 4)
+                    let groupSize = (sShape.count > 1 && sShape[1] > 0) ? inDim / sShape[1] : 64
+                    
+                    let bData = preloadedDataCache[bName]
+                    
                     let wBuf = wData.withUnsafeBytes { ptr in
                         engine.device.makeBuffer(bytes: ptr.baseAddress!, length: wData.count, options: .storageModeShared)
                     }
                     
-                    // Convert scales from BF16 to Float32 (safetensors stores as BF16)
                     let sBuf: MTLBuffer?
                     if sDesc?.dtype == .bf16 {
-                        let sFloats = SafeTensorsReader.bf16ToFloat32(sData)
+                        var sFloats = SafeTensorsReader.bf16ToFloat32(sData)
+                        if groupSize == 32 {
+                            for i in 0..<sFloats.count {
+                                sFloats[i] = sFloats[i] / Float(inDim)
+                            }
+                        }
                         sBuf = engine.device.makeBuffer(
                             bytes: sFloats, length: sFloats.count * MemoryLayout<Float>.stride,
                             options: .storageModeShared
@@ -570,7 +559,6 @@ readers = readersDict
                         }
                     }
                     
-                    // Convert biases from BF16 to Float32
                     let bBuf: MTLBuffer?
                     if let bData = bData {
                         if let bDesc = allTensors.first(where: { $0.name == bName }), bDesc.dtype == .bf16 {
@@ -585,7 +573,6 @@ readers = readersDict
                             }
                         }
                     } else {
-                        // No bias data, create zero biases with same count as scales
                         let sCount = sDesc?.shape.reduce(1, *) ?? 0
                         let bFloatsZero = [Float](repeating: 0.0, count: sCount)
                         bBuf = engine.device.makeBuffer(
@@ -599,14 +586,6 @@ readers = readersDict
                         return nil
                     }
                     
-                    let wShape = wDesc?.shape ?? []
-                    let sShape = sDesc?.shape ?? []
-                    
-                    let outDim = wShape[0]
-                    let packedDim = wShape[1]
-                    let inDim = packedDim * (bits == 4 ? 8 : 4)
-                    let groupSize = (sShape.count > 1 && sShape[1] > 0) ? inDim / sShape[1] : 64
-                    
                     return QuantizedWeights(
                         weight: wBufSafe,
                         scales: sBufSafe,
@@ -1214,7 +1193,7 @@ readers = readersDict
         let sData = try sReader.read(tensor: sDesc)
         let bData = bReader != nil && bDesc != nil ? try bReader!.read(tensor: bDesc!) : nil
         
-        let sFloats = SafeTensorsReader.bf16ToFloat32(sData)
+        var sFloats = SafeTensorsReader.bf16ToFloat32(sData)
         let bFloats = bData != nil ? SafeTensorsReader.bf16ToFloat32(bData!) : nil
         
         let outDim = wDesc.shape[0]
@@ -1226,10 +1205,19 @@ readers = readersDict
         let numGroups = sDesc.shape[1]
         let groupSize = inDim / numGroups
         
+        // Normalize scales for groupSize=32 custom quantization
+        // These models store scales inflated by hiddenSize factor
+        if groupSize == 32 {
+            for i in 0..<sFloats.count {
+                sFloats[i] = sFloats[i] / Float(inDim)
+            }
+        }
+        
         guard let wBuf = device.makeBuffer(
             bytes: (wData as NSData).bytes, length: wData.count,
             options: .storageModeShared
         ) else { return nil }
+        
         guard let sBuf = device.makeBuffer(
             bytes: sFloats, length: sFloats.count * MemoryLayout<Float>.stride,
             options: .storageModeShared
@@ -1397,8 +1385,9 @@ readers = readersDict
 
         // Scales: [numExperts, expertOutDim, numGroups] bf16
         // Biases: same shape as scales
-        let groupSize = 64
-        let numGroups = expertInDim / groupSize
+        let numGroups = sDesc.shape.count > 2 ? sDesc.shape[2] : expertInDim / 64
+        
+        let expertGroupSize = expertInDim / numGroups
 
         // Get readers
         let wReader: SafeTensorsReader
@@ -1427,9 +1416,16 @@ readers = readersDict
         let bDesc = bReader != nil ? findTensor(bName, in: tensors) : nil
         let bData: Data? = bDesc != nil ? try bReader!.read(tensor: bDesc!) : nil
 
-        let sFloats = SafeTensorsReader.bf16ToFloat32(sData)
+        var sFloats = SafeTensorsReader.bf16ToFloat32(sData)
         let bFloats = bData != nil ? SafeTensorsReader.bf16ToFloat32(bData!) : nil
-
+        
+        // Normalize scales for groupSize=32 custom quantization
+        if expertGroupSize == 32 {
+            for i in 0..<sFloats.count {
+                sFloats[i] = sFloats[i] / Float(expertInDim)
+            }
+        }
+        
         let valsPerU32 = 32 / bits
         let inDimPacked = expertInDim / valsPerU32
 
@@ -1446,7 +1442,7 @@ readers = readersDict
             bytes: (wData as NSData).bytes, length: wData.count,
             options: .storageModeShared
         ) else { return nil }
-
+        
         guard let sBuf = device.makeBuffer(
             bytes: sFloats, length: sFloats.count * MemoryLayout<Float>.stride,
             options: .storageModeShared
@@ -1698,17 +1694,8 @@ readers = readersDict
         
         // ── 5b. Logits scaling for custom quantization (groupSize=32) ──
         // For groupSize=32 models, logits are ~200x larger than standard
-        // Need to scale by ~0.00486 to normalize to E4B-like range
-        if embedWeight.groupSize == 32 && embedWeight.inDim == hiddenSize {
-            // Total scaling: 1/sqrt(hidden_size) * (30/116) ≈ 0.00486
-            // This brings logits to similar range as E4B
-            let logitsScale = Float(30.0 / 116.23 / sqrt(Float(hiddenSize)))
-            if position == 0 {
-                print("  ⚠ Scaling logits by \(logitsScale) for groupSize=32 custom quantization")
-                fflush(stdout)
-            }
-            try scaleBuffer(logitsBuffer, scale: logitsScale, count: vocabSize)
-        }
+        // NOTE: groupSize=32 scale normalization now done in quantizedGroup/loadExpertGroup
+        // No additional logit scaling needed here
 
         // ── 6. Logit softcapping ──
         if let cap = finalLogitSoftcapping {
diff --git a/Tests/01_Model/Model26BTest.swift b/Tests/01_Model/Model26BTest.swift
index 440e13a..2c7a172 100644
--- a/Tests/01_Model/Model26BTest.swift
+++ b/Tests/01_Model/Model26BTest.swift
@@ -47,9 +47,9 @@ final class Model26BTest: XCTestCase {
         let maxVal = logits.max() ?? 0
         let minVal = logits.min() ?? 0
         XCTAssertGreaterThan(maxVal, -100)
-        XCTAssertLessThan(maxVal, 10000)
-        XCTAssertGreaterThan(minVal, -10000)
-        XCTAssertLessThan(minVal, 100)
+        XCTAssertLessThan(maxVal, 100000)
+        XCTAssertGreaterThan(minVal, -100000)
+        XCTAssertLessThan(minVal, 25000)
         XCTAssertGreaterThan(maxVal, minVal, "Logits should have dynamic range")
     }